Spaces:

CognizantAI
/

IntelAnalyser

Sleeping

App Files Files Community

ashischakraborty committed on Feb 10, 2025

Commit

3daab2e

verified ·

1 Parent(s): 927b503

UC#3 first upload

Browse files

Files changed (3) hide show

app.py +209 -0
azure_openai.py +349 -0
helper_functions.py +28 -0

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import streamlit as st
+import PyPDF2
+import pandas as pd
+import uuid
+import tiktoken
+import re
+from datetime import datetime
+from helper_functions import extract_text_from_pdf
+from azure_openai import evaluation_process, process_insight, process_compare, risk_score_process, process_chunks
def count_tokens_with_tiktoken(text, encoding_name="cl100k_base"):
    """
    Tokenize ``text`` with tiktoken and report how many tokens it contains.

    :param text: Input text.
    :param encoding_name: Name of the tiktoken encoding to use. Defaults to
        "cl100k_base", the encoding this function has always used, so
        existing callers are unaffected.
        NOTE(review): the deployment configured elsewhere in this project is
        "gpt-4o-mini"; confirm whether a different encoding is a better match.
    :return: Tuple ``(token_count, tokens)`` where ``tokens`` is the list of
        token ids returned by the encoding.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    tokens = tokenizer.encode(text)
    return len(tokens), tokens
def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
    """
    Split text into token-based chunks with overlap and track the pages each chunk spans.

    :param text: Combined text of the document.
    :param page_texts: List of tuples [(page_number, page_text), ...].
    :param chunk_size: Maximum size of each chunk in tokens (must be > 0).
    :param overlap: Number of overlapping tokens between consecutive chunks
        (must satisfy 0 <= overlap < chunk_size).
    :return: List of dictionaries with keys 'ChunkText', 'TokenCount',
        'StartPage', 'EndPage' and 'ChunkID'.
    :raises ValueError: If chunk_size or overlap are outside the ranges above.
    """
    # Validate first: a zero step makes range() raise an opaque error and a
    # negative step silently yields no chunks at all.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    # Map token positions to page numbers. Pages are tokenized one by one, so
    # this map can differ slightly in length from the tokens of the combined
    # text; positions beyond the map fall back to the last known page.
    page_token_map = []
    for page_number, page_text in page_texts:
        page_token_map.extend([page_number] * len(tokenizer.encode(page_text)))
    last_page = page_texts[-1][0] if page_texts else None

    def page_at(position):
        if position < len(page_token_map):
            return page_token_map[position]
        return last_page

    chunks = []
    total_tokens = len(tokens)
    step = chunk_size - overlap
    for start in range(0, total_tokens, step):
        end = min(start + chunk_size, total_tokens)
        chunk_tokens = tokens[start:end]
        chunks.append({
            "ChunkText": tokenizer.decode(chunk_tokens),
            "TokenCount": len(chunk_tokens),
            "StartPage": page_at(start),
            "EndPage": page_at(end - 1),
            "ChunkID": str(uuid.uuid4()),
        })
        # Once a chunk reaches the end of the document, any further chunk
        # would only repeat the tail that was just emitted.
        if end >= total_tokens:
            break
    return chunks
def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
    """
    Chunk every document row of ``df`` and gather all chunks in one DataFrame.

    Each row's 'Text' and 'PageTexts' values are handed to
    split_text_into_chunks_with_tiktoken_and_pages; the chunk dictionaries it
    returns become the rows of the result.

    :param df: DataFrame whose rows provide 'Text' and 'PageTexts'.
    :param chunk_size: The maximum size of each chunk in tokens.
    :param overlap: The number of overlapping tokens between consecutive chunks.
    :return: DataFrame built from the chunk dictionaries of all rows.
    """
    chunk_records = [
        record
        for _, document in df.iterrows()
        for record in split_text_into_chunks_with_tiktoken_and_pages(
            document['Text'], document['PageTexts'], chunk_size, overlap
        )
    ]
    return pd.DataFrame(chunk_records)
# Token budget for each chunk and the tokens shared by consecutive chunks.
_CHUNK_SIZE_TOKENS = 10000
_CHUNK_OVERLAP_TOKENS = 1000


def _extract_and_report(uploaded_file, spinner_text, success_text):
    """Extract a PDF's text, show its token count in the UI and return (text, page_texts)."""
    with st.spinner(spinner_text):
        text, page_texts = extract_text_from_pdf(uploaded_file)
        token_count, _ = count_tokens_with_tiktoken(text)
    st.sidebar.success(success_text)
    st.write("Token Count")
    st.write(f"The PDF contains **{token_count}** tokens.")
    return text, page_texts


def _chunk_and_report(uploaded_file, text, page_texts, spinner_text, done_text):
    """Split one extracted document into chunks, display them and return the chunk DataFrame."""
    with st.spinner(spinner_text):
        df_input = pd.DataFrame([{"Title": uploaded_file.name, "Text": text, "PageTexts": page_texts}])
        df_chunks = split_dataframe_with_combined_text_and_pages(
            df_input, _CHUNK_SIZE_TOKENS, _CHUNK_OVERLAP_TOKENS
        )
    st.write(done_text)
    st.sidebar.success(done_text)
    st.write(df_chunks)
    return df_chunks


def main():
    """
    Streamlit entry point for the RegIntel risk analyser.

    Flow: pick a topic, upload an external intel PDF, an internal intel PDF
    and an SOP PDF; extract and chunk all three; evaluate both intel
    documents against the topic; generate insights; compare the insights
    classified as 'impact' with the SOP chunks; risk-score the comparisons
    that need review. The function returns early when an upload is still
    missing or when a stage leaves nothing to process.
    """
    st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
    st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
    topic = st.selectbox(
        "Please choose a focus for the system",
        (
            "Labelling",
            "Institutional Review Board/Independent Ethics Committee",
            "Investigator",
            "Sponsor",
            "Clinical Trial Protocol and protocol amendments",
            "Investigator's Brochure",
            "Conduct of Clinical Trial",
            "Monitoring",
            "Auditing",
            "Data handling and record keeping",
            "clinical trial reports",
            "Responsibilities of the Sponsor and Investigator",
            "Sponsor Inspection Preparation",
        ),
    )

    # The two uploaders need distinct labels: Streamlit derives a widget's
    # identity from its type and arguments, so two identical calls without a
    # key collide.
    uploaded_extintl_file_insight = st.file_uploader("Upload a External Reg Intel", type="pdf")
    uploaded_interintel_file_insight = st.file_uploader("Upload an Internal Reg Intel", type="pdf")
    if uploaded_extintl_file_insight is None or uploaded_interintel_file_insight is None:
        return

    # The SOP uploader is only offered once both intel documents are present.
    uploaded_file_SOP = st.file_uploader("Upload an SOP file", type="pdf")
    if uploaded_file_SOP is None:
        return

    # Extract the three documents
    ext_intl_text_insight, ext_intl_page_texts_insight = _extract_and_report(
        uploaded_extintl_file_insight,
        "Processing External Reg Intel",
        "External Reg Intel file successfully processed",
    )
    int_intl_text_insight, int_intl_page_texts_insight = _extract_and_report(
        uploaded_interintel_file_insight,
        "Processing Internal Reg Intel",
        "Internal Reg Intel file successfully processed",
    )
    text_SOP, page_texts_SOP = _extract_and_report(
        uploaded_file_SOP,
        "Processing the SOP Text...",
        "SOP file successfully processed",
    )

    # Split each document into chunks
    df_ei_insight_chunks = _chunk_and_report(
        uploaded_extintl_file_insight,
        ext_intl_text_insight,
        ext_intl_page_texts_insight,
        "Processing the Insight Document...",
        "Processed External Reg Intel",
    )
    df_ii_insight_chunks = _chunk_and_report(
        uploaded_interintel_file_insight,
        int_intl_text_insight,
        int_intl_page_texts_insight,
        "Processing the Insight Document...",
        "Processed Internal Reg Intel",
    )
    df_sop_chunks = _chunk_and_report(
        uploaded_file_SOP,
        text_SOP,
        page_texts_SOP,
        "Processing the SOP Document...",
        "Processed SOP",
    )

    # Evaluate both intel documents against the topic
    with st.spinner("Evaluating document"):
        df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic, "ext")
        df_ei_eval["source"] = "external intel"
        df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic, "intl")
        df_ii_eval["source"] = "internal intel"
        # One column of decision counts per source. Writing a "source" entry
        # into the count Series themselves would add a bogus row to the counts.
        score = pd.concat(
            [ei_score.rename("external intel"), ii_score.rename("internal intel")],
            axis=1,
        ).fillna(0).astype(int)
    st.write("External Inteligence Evaluation")
    st.sidebar.success(f"Evaluation Concensus: {ei_con}")
    st.write(f"Evaluation Concensus: {ei_con}")
    st.write("Evaluation Scores:")
    st.write(score)
    if ei_con == "False" and ii_con == "False":
        st.sidebar.error("Document Not Relevant To Topic")
        st.write("Document Not Relevant To Topic")
        st.write("Exiting RegIntel Analysis")
        return

    # Generate insights
    with st.spinner("Creating insights"):
        df_ei_insights = process_chunks(df_ei_insight_chunks, topic, "ext")
        df_ii_insights = process_chunks(df_ii_insight_chunks, topic, "intl")
        df_ei_insights["source"] = "external intel"
        df_ii_insights["source"] = "internal intel"
        df_insights = pd.concat([df_ei_insights, df_ii_insights])
    st.subheader("External Inteligence Insights")
    st.sidebar.success("External Inteligence Insights Created")
    st.write(df_insights)
    filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
    if filtered_insights_on_impact.empty:
        st.write("No impact insights")
        st.sidebar.error("No impact insights")
        return

    # Compare the impact insights with the SOP
    with st.spinner("Comparing Impact Classified Insights To SOP"):
        df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
    st.subheader("Comparison of Insights to SOP's")
    st.sidebar.success("Comparison of External Intel to SOP's Complete")
    st.write(df_compare)
    filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'].eq(True)]
    if filtered_comparisons_df.empty:
        st.write("No reviews needed for this SOP")
        st.sidebar.error("No reviews needed for this SOP")
        return

    # Risk scoring
    with st.spinner("Risk Assessing Insights To SOP"):
        df_risks = risk_score_process(filtered_comparisons_df, topic)
    st.subheader("Risk Score of Insights to SOP's")
    st.sidebar.success("Risk Score of Insights to SOP's Completed")
    st.write(df_risks)


if __name__ == "__main__":
    main()

azure_openai.py ADDED Viewed

	@@ -0,0 +1,349 @@

+import streamlit as st
+import os
+import pandas as pd
+# from langchain.chat_models import AzureChatOpenAI
+from langchain_openai import AzureChatOpenAI
+from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
+from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
+from pydantic import BaseModel, Field, validator
+from langchain.output_parsers.enum import EnumOutputParser
+from langchain_core.prompts import PromptTemplate
+from enum import Enum
# LangSmith tracing is configured through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
# Tracing is switched on via the environment above, so export the key the
# same way; holding it only in a Python variable leaves it unused.
# setdefault keeps a value that the hosting environment already provides.
os.environ.setdefault("LANGCHAIN_API_KEY", LANGCHAIN_API_KEY)
os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"
# LLM Langchain Definition
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
# Function to read file contents
def read_file(file):
    """
    Read ``assets/<file>.md`` as UTF-8 text and return its content.

    :param file: Base name (without the ``.md`` extension) of a file in the
        'assets' directory.
    :return: The content of the file as a string, or None if the file is
        missing, cannot be read, or is not valid UTF-8. A message is printed
        in each failure case.
    """
    fp = f"assets/{file}.md"
    try:
        # Use a separate name for the handle so the ``file`` argument is not
        # overwritten.
        with open(fp, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is not an OSError; without it a mis-encoded file
        # would raise instead of producing the documented None.
        print(f"An error occurred while reading the file at {fp}.")
    return None
# Function to generate structured insights
def process_insight(chunk, topic, source):
    """
    Ask the LLM for insights about one text chunk, one insight per call.

    The prompt files are chosen by ``source`` ("intl" selects the internal
    prompts, anything else the external ones). Each returned insight is
    classified with selectClass. The loop stops once the model reports
    completion and at least 3 insights exist, or after 5 insights.

    :param chunk: Text of the chunk to analyse.
    :param topic: Topic passed to the prompt.
    :param source: "intl" for internal intel; any other value uses the
        external prompts.
    :return: DataFrame with columns 'classification', 'insight' and 'chunk'.
    """
    max_insights = 5
    min_insights = 3

    GSKGlossary = read_file("GSKGlossary")
    system_name, user_name = (
        ("intl_insight_system_message", "intl_insight_user_message")
        if source == "intl"
        else ("ext_insight_system_message", "ext_insight_user_message")
    )
    SystemMessage = read_file(system_name)
    UserMessage = read_file(user_name)

    class Insights(BaseModel):
        completed: bool = Field(description="This field is used to indicate that you think the number of insights has been completed")
        insight: str = Field(description="This field is used to return the MECE insight in string format")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(SystemMessage), UserMessage]
    )
    chain = prompt | llm.with_structured_output(Insights)

    collected = []
    rows = []
    for _ in range(max_insights):
        # The insights gathered so far and the remaining budget go into the prompt.
        response = chain.invoke({
            "chunk": chunk,
            "existing_insights": collected,
            "counter": max_insights - len(collected),
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        label = selectClass(response.insight)
        collected.append(response.insight)
        rows.append({
            "classification": label,
            "insight": response.insight,
            "chunk": chunk,
        })
        # Stop early only when the model says it is done AND the minimum is met.
        if response.completed and len(collected) >= min_insights:
            break
    return pd.DataFrame(rows)
def selectClass(insight):
    """
    Classify one insight with the LLM and return the chosen label.

    The model output is parsed by an EnumOutputParser restricted to the
    members of InsightClassification, and the member's value is returned.

    :param insight: Insight text passed to the classification prompt.
    :return: The value of the parsed enum member: "impact", "consultation"
        or "awareness".
    """
    system_text = read_file("classification_system_message")
    user_text = read_file("classification_user_message")

    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    parser = EnumOutputParser(enum=InsightClassification)
    base_prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_text), user_text]
    )
    # Pre-fill the "options" variable with the parser's format instructions.
    prompt = base_prompt.partial(options=parser.get_format_instructions())
    chosen = (prompt | llm | parser).invoke({"insight": insight})
    return chosen.value
def process_chunks(chunk, topic, source):
    """
    Generate insights for every chunk of a document and combine them.

    :param chunk: DataFrame of chunks; the text is read from its
        'ChunkText' column. (The name is kept for backward compatibility even
        though it holds many chunks.)
    :param topic: Topic forwarded to process_insight.
    :param source: Source marker forwarded to process_insight.
    :return: A single DataFrame with the insights of all chunks. When there
        are no chunks, an empty DataFrame with the columns 'classification',
        'insight' and 'chunk' is returned.
    """
    # pd.concat raises on an empty list, so handle "no chunks" explicitly.
    if chunk.empty:
        return pd.DataFrame(columns=["classification", "insight", "chunk"])
    all_insights = [
        process_insight(chunk_text, topic, source)
        for chunk_text in chunk["ChunkText"]
    ]
    return pd.concat(all_insights, ignore_index=True)
def evaluation_llm(chunk, topic, source):
    """
    Ask the LLM whether one chunk relates to the topic.

    The prompt files are chosen by ``source`` ("intl" selects the internal
    prompts, anything else the external ones) and the model is asked for
    output structured as Evaluate (a boolean decision plus a justification).

    :param chunk: Text of the chunk to evaluate.
    :param topic: Topic passed to the prompt.
    :param source: "intl" for internal intel; any other value uses the
        external prompts.
    :return: The result of invoking the chain; callers read its ``decision``
        and ``justification`` attributes.
    """
    GSKGlossary = read_file("GSKGlossary")
    system_name, user_name = (
        ("intl_eval_system_message", "intl_eval_user_message")
        if source == "intl"
        else ("ext_eval_system_message", "ext_eval_user_message")
    )
    SystemMessage = read_file(system_name)
    UserMessage = read_file(user_name)

    class Evaluate(BaseModel):
        decision: bool = Field(description="True: The content of the document relates to the topic.False: The content of the document does not relate to the topic.")
        justification: str = Field(description="Please justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    # System template plus the user message form the chat prompt.
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(SystemMessage), UserMessage]
    )
    chain = prompt | llm.with_structured_output(Evaluate)
    payload = {"chunk": chunk, "topic": topic, "GSKGlossary": GSKGlossary}
    return chain.invoke(payload)
def evaluation_process(df_chunks, topic, source):
    """
    Evaluate every chunk against the topic and derive a consensus decision.

    Each row's 'ChunkText' is sent to evaluation_llm. The decisions (stored
    as the strings "True"/"False") and justifications are added as the first
    two columns of a copy of ``df_chunks``; the caller's DataFrame is left
    untouched.

    :param df_chunks: DataFrame containing chunks in a 'ChunkText' column.
    :param topic: Topic forwarded to evaluation_llm.
    :param source: Source marker forwarded to evaluation_llm.
    :return: Tuple ``(df, consensus_value, consensus_count)`` where ``df`` has
        the 'Decision' and 'Justification' columns, ``consensus_count`` is the
        value count of the decisions and ``consensus_value`` is the most
        frequent decision. With no chunks there is nothing to support
        relevance, so the consensus is "False".
    """
    # drop() returns a new frame, so the inserts below never touch the
    # caller's data; errors="ignore" copes with either column being absent.
    evaluated = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")

    decisions = []
    justifications = []
    for _, row in evaluated.iterrows():
        result = evaluation_llm(row['ChunkText'], topic, source)
        decisions.append("True" if result.decision else "False")  # Convert bool to string
        justifications.append(result.justification)

    evaluated.insert(0, "Decision", decisions)
    evaluated.insert(1, "Justification", justifications)

    consensus_count = evaluated["Decision"].value_counts()
    # idxmax raises on an empty Series, so guard the no-chunk case.
    consensus_value = "False" if consensus_count.empty else consensus_count.idxmax()
    return evaluated, consensus_value, consensus_count
def process_compare(insight_df, sopChunk_df, topic):
    """
    Compare every insight with every SOP chunk and record whether a review is needed.

    :param insight_df: DataFrame of insights; the text is read from its
        'insight' column.
    :param sopChunk_df: DataFrame of SOP chunks; the text is read from its
        'ChunkText' column.
    :param topic: Topic passed to the prompt.
    :return: DataFrame with columns 'ReviewNeeded', 'Justification', 'SOP' and
        'Insight', one row per (SOP chunk, insight) pair. The columns are
        present even when there is nothing to compare, so callers can filter
        on 'ReviewNeeded' safely.
    """
    result_columns = ["ReviewNeeded", "Justification", "SOP", "Insight"]
    # Nothing to compare: skip prompt loading and client construction.
    if insight_df.empty or sopChunk_df.empty:
        return pd.DataFrame(columns=result_columns)

    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Define the structured output model
    class Compare(BaseModel):
        review: bool = Field(description="This field is used to indicate whether a review is needed")
        justification: str = Field(description="This field is used to justify why a review is needed")

    # Initialize the LLM
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    # Create the structured output and prompt chain
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []
    for sop_chunk_text in sopChunk_df["ChunkText"]:
        for insight_text in insight_df["insight"]:
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })
    # The previous debug print of the full result (including every SOP chunk)
    # was removed; the caller already displays the returned DataFrame.
    return pd.DataFrame(compare_data, columns=result_columns)
def risk_score_process(compare_df, topic):
    """
    Risk-score each comparison row with the LLM.

    :param compare_df: DataFrame with 'Justification', 'Insight' and 'SOP'
        columns (the rows that need review).
    :param topic: Topic passed to the prompt.
    :return: DataFrame with columns 'RiskLevel' ("high", "medium" or "low"),
        'Justification', 'advice', 'comparison', 'insight' and 'SOPchunk'.
        The columns are present even when ``compare_df`` is empty.
    """
    result_columns = ["RiskLevel", "Justification", "advice", "comparison", "insight", "SOPchunk"]
    # Nothing to score: skip prompt loading and client construction.
    if compare_df.empty:
        return pd.DataFrame(columns=result_columns)

    # NOTE(review): the glossary used to be read here but was never passed to
    # the chain, so the unused read was dropped. If the risk prompts are meant
    # to receive it, add it to the invoke payload.
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Define the Enum for predefined options
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Define the Pydantic model for the structured output
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            description="Suggestions for changes that could be made to the standard operating procedure to mitigat the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    risk_data = []
    for _, row in compare_df.iterrows():
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic
        })
        risk_level = risk_response.risk_level
        risk_data.append({
            # Store the plain label rather than the Enum member, matching how
            # selectClass returns the enum's value.
            "RiskLevel": getattr(risk_level, "value", risk_level),
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP']
        })
    return pd.DataFrame(risk_data, columns=result_columns)

helper_functions.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import PyPDF2
+import tiktoken
def extract_text_from_pdf(file):
    """
    Extracts text from a PDF file and tracks text by page.

    :param file: Uploaded PDF file object.
    :return: Tuple (text, page_texts), where:
             - text is the combined text of the entire PDF (page texts
               concatenated in order with no separator).
             - page_texts is a list of tuples [(page_number, page_text), ...]
               with 1-indexed page numbers.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # "or ''" guards against a page for which no text comes back, which would
    # otherwise break the concatenation below.
    page_texts = [
        (page_number, page.extract_text() or "")
        for page_number, page in enumerate(pdf_reader.pages, start=1)
    ]
    # Join once instead of growing a string page by page.
    text = "".join(page_text for _, page_text in page_texts)
    return text, page_texts
def count_tokens(string: str) -> int:
    """Return how many tokens the "o200k_base" tiktoken encoding produces for ``string``."""
    token_ids = tiktoken.get_encoding("o200k_base").encode(string)
    return len(token_ids)