Spaces:

drewgenai
/

protocol-sync

Sleeping

App Files Files Community

drewgenai committed on Mar 11, 2025

Commit

078c1b4

1 Parent(s): f0c5aed

initial commit

Browse files

Files changed (15) hide show

.gitignore +12 -0
Dockerfile +30 -0
app.py +359 -0
chainlit.md +2 -0
data/Instruments_Definitions.xlsx +0 -0
example_files/Instruments_Definitions.xlsx +0 -0
example_files/docx/Protocol_NOAPS v1.0.docx +0 -0
example_files/docx/Protocol_PKAS v1.0.docx +0 -0
example_files/docx/Protocol_PPMT v1.0.docx +0 -0
example_files/pdf/Protocol_NOAPS v1.0.pdf +0 -0
example_files/pdf/Protocol_PKAS v1.0.pdf +0 -0
example_files/pdf/Protocol_PPMT v1.0.pdf +0 -0
pyproject.toml +38 -0
requirements.txt +210 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+__pycache__/
+.chainlit/
+.venv/
+.env
+/output/
+/upload/
+*.jsonl
+/models/
+*z*.py
+*z*.md
+*z*.ipynb
+/z*

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+# Get a distribution that has uv already installed
+FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
+# Add user - this is the user that will run the app
+# If you do not set user, the app will run as root (undesirable)
+RUN useradd -m -u 1000 user
+USER user
+# Set the home directory and path
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# NOTE(review): presumably selects the "websockets" backend for uvicorn's WebSocket handling - confirm against the uvicorn settings docs
+ENV UVICORN_WS_PROTOCOL=websockets
+# Set the working directory
+WORKDIR $HOME/app
+# Copy the app to the container
+COPY --chown=user . $HOME/app
+# Install the dependencies
+# NOTE(review): the frozen variant below is disabled, so the build runs a plain `uv sync`; confirm that not pinning strictly to uv.lock is intended
+# RUN uv sync --frozen
+RUN uv sync
+# Expose the port
+EXPOSE 7860
+# Run the app
+# Chainlit serves app.py on all interfaces, on the same port that is exposed above
+CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,359 @@

+import os
+import shutil
+import json
+import pandas as pd
+import chainlit as cl
+from dotenv import load_dotenv
+from langchain_core.documents import Document
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.vectorstores import Qdrant
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.tools import tool
+from langchain.schema import HumanMessage
+from typing_extensions import List, TypedDict
+from operator import itemgetter
+from langchain.agents import AgentExecutor, create_openai_tools_agent
+from langchain_core.prompts import MessagesPlaceholder
+from qdrant_client import QdrantClient
+from qdrant_client.models import VectorParams, Distance
+# Load environment variables from a .env file before any client is constructed.
+load_dotenv()
+# Working directories: uploaded PDFs are copied into UPLOAD_PATH and generated
+# CSV exports are written to OUTPUT_PATH.
+UPLOAD_PATH = "upload/"
+OUTPUT_PATH = "output/"
+# Excel workbook that is indexed as reference data when a chat starts.
+INITIAL_DATA_PATH = "./data/Instruments_Definitions.xlsx"
+# Create both working directories at import time so later file writes do not
+# fail on a missing folder.
+os.makedirs(UPLOAD_PATH, exist_ok=True)
+os.makedirs(OUTPUT_PATH, exist_ok=True)
+# Initialize embeddings model
+# The same embedding model is shared by the semantic chunker below and by the
+# Qdrant vector store created in initialize_vector_store().
+model_id = "Snowflake/snowflake-arctic-embed-m"
+embedding_model = HuggingFaceEmbeddings(model_name=model_id)
+semantic_splitter = SemanticChunker(embedding_model, add_start_index=True, buffer_size=30)
+# LLM used inside the RAG chains of both tools; the agent itself is built with
+# its own ChatOpenAI instance further down.
+llm = ChatOpenAI(model="gpt-4o-mini")
+# Export comparison prompt
+# Template used by document_comparison_tool. It is filled with {context} (the
+# retrieved chunks) and {question}; the doubled braces ({{ }}) render as literal
+# braces so the JSON examples survive template formatting.
+# NOTE(review): rule 7 at the end of the prompt reads as a truncated sentence -
+# confirm the intended wording before relying on it.
+export_prompt = """
+CONTEXT:
+{context}
+QUERY:
+{question}
+You are a helpful assistant. Use the available context to answer the question.
+Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
+### **Output Format:**
+Return the response in **valid JSON format** structured as a list of dictionaries, where each dictionary contains:
+[
+    {{
+        "Derived Description": "A short name for the matched concept",
+        "Protocol_1": "Protocol 1 - Matching Element",
+        "Protocol_2": "Protocol 2 - Matching Element"
+    }},
+    ...
+]
+### **Example Output:**
+[
+    {{
+        "Derived Description": "Pain Coping Strategies",
+        "Protocol_1": "Pain Coping Strategy Scale (PCSS-9)",
+        "Protocol_2": "Chronic Pain Adjustment Index (CPAI-10)"
+    }},
+    {{
+        "Derived Description": "Work Stress and Fatigue",
+        "Protocol_1": "Work-Related Stress Scale (WRSS-8)",
+        "Protocol_2": "Occupational Fatigue Index (OFI-7)"
+    }},
+    ...
+]
+### Rules:
+1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.
+2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.
+3. If no matching element is found in a protocol, leave it empty ("").
+4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.
+5. It should include all the elements in the two protocols.
+6. If it cannot match the element, create the row and include the protocol it did find and put "could not match" in the other protocol column.
+7. protocol should be the between
+"""
+# Compiled chat prompt consumed by the comparison RAG chain.
+compare_export_prompt = ChatPromptTemplate.from_template(export_prompt)
+# Free-form question answering prompt used by document_query_tool. It receives
+# the retrieved chunks as {context} and the user's question as {question}.
+QUERY_PROMPT = """
+You are a helpful assistant. Use the available context to answer the question concisely and informatively.
+CONTEXT:
+{context}
+QUERY:
+{question}
+Provide a natural-language response using the given information. If you do not know the answer, say so.
+"""
+# Compiled chat prompt consumed by the query RAG chain.
+query_prompt = ChatPromptTemplate.from_template(QUERY_PROMPT)
+@tool
+def document_query_tool(question: str) -> str:
+    """Retrieves relevant document sections and answers questions based on the uploaded documents."""
+    retriever = cl.user_session.get("qdrant_retriever")
+    if not retriever:
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    retriever = retriever.with_config({"k": 10})
+    # Use a RAG chain similar to the comparison tool
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | query_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
+    # Get the retrieved docs for context
+    retrieved_docs = retriever.invoke(question)
+    return {
+        "messages": [HumanMessage(content=response_text)],
+        "context": retrieved_docs
+    }
+@tool
+def document_comparison_tool(question: str) -> str:
+    """Compares the two uploaded documents, identifies matched elements, exports them as JSON, formats into CSV, and provides a download link."""
+    # Retrieve the vector database retriever
+    retriever = cl.user_session.get("qdrant_retriever")
+    if not retriever:
+        return "Error: No documents available for retrieval. Please upload two PDF files first."
+    # Process query using RAG
+    rag_chain = (
+        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+        | compare_export_prompt | llm | StrOutputParser()
+    )
+    response_text = rag_chain.invoke({"question": question})
+    # Parse response and save as CSV
+    try:
+        structured_data = json.loads(response_text)
+        if not structured_data:
+            return "Error: No matched elements found."
+        # Define output file path
+        file_path = os.path.join(OUTPUT_PATH, "comparison_results.csv")
+        # Save to CSV
+        df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
+        df.to_csv(file_path, index=False)
+        # Send the message with the file directly from the tool
+        cl.run_sync(
+            cl.Message(
+                content="Comparison complete! Download the CSV below:",
+                elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
+            ).send()
+        )
+        # Return a simple confirmation message
+        return "Comparison results have been generated and displayed."
+    except json.JSONDecodeError:
+        return "Error: Response is not valid JSON."
+# Define tools for the agent
+tools = [document_query_tool, document_comparison_tool]
+# Set up the agent with a system prompt
+# The prompt names both tools explicitly so the model can route each request.
+system_prompt = """You are an intelligent document analysis assistant. You have access to two tools:
+1. document_query_tool: Use this when a user wants information or has questions about the content of uploaded documents.
+2. document_comparison_tool: Use this when a user wants to compare elements between two uploaded documents or export comparison results.
+Analyze the user's request carefully to determine which tool is most appropriate.
+"""
+# Create the agent using OpenAI function calling
+# Message order: system prompt, prior turns ("chat_history"), the new user
+# input, then the agent's intermediate tool calls ("agent_scratchpad").
+agent_prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    MessagesPlaceholder(variable_name="chat_history"),
+    ("human", "{input}"),
+    MessagesPlaceholder(variable_name="agent_scratchpad"),
+])
+# The routing agent uses its own gpt-4o model at temperature 0, separate from
+# the module-level `llm` that the tools' RAG chains use.
+agent = create_openai_tools_agent(
+    llm=ChatOpenAI(model="gpt-4o", temperature=0),
+    tools=tools,
+    prompt=agent_prompt
+)
+# Create the agent executor
+agent_executor = AgentExecutor.from_agent_and_tools(
+    agent=agent,
+    tools=tools,
+    verbose=True,
+    handle_parsing_errors=True,
+)
+def initialize_vector_store():
+    """Initialize an empty Qdrant vector store"""
+    try:
+        # Create a Qdrant client for in-memory storage
+        client = QdrantClient(location=":memory:")
+        # Create the collection with the appropriate vector size
+        # Snowflake/snowflake-arctic-embed-m produces 768-dimensional vectors
+        vector_size = 768  # Changed from 1536 to match your embedding model
+        # Check if collection exists, if not create it
+        collections = client.get_collections().collections
+        collection_names = [collection.name for collection in collections]
+        if "document_comparison" not in collection_names:
+            client.create_collection(
+                collection_name="document_comparison",
+                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
+            )
+            print("Created new collection: document_comparison")
+        # Create the vector store with the client
+        vectorstore = Qdrant(
+            client=client,
+            collection_name="document_comparison",
+            embeddings=embedding_model
+        )
+        print("Vector store initialized successfully")
+        return vectorstore
+    except Exception as e:
+        print(f"Error initializing vector store: {str(e)}")
+        return None
+async def load_reference_data(vectorstore):
+    """Load reference Excel data into the vector database"""
+    if not os.path.exists(INITIAL_DATA_PATH):
+        print(f"Warning: Initial data file {INITIAL_DATA_PATH} not found")
+        return vectorstore
+    try:
+        # Load Excel file
+        df = pd.read_excel(INITIAL_DATA_PATH)
+        # Convert DataFrame to documents
+        documents = []
+        for _, row in df.iterrows():
+            # Combine all columns into a single text
+            content = " ".join([f"{col}: {str(val)}" for col, val in row.items()])
+            doc = Document(page_content=content, metadata={"source": "Instruments_Definitions.xlsx"})
+            documents.append(doc)
+        # Add documents to vector store
+        if documents:
+            vectorstore.add_documents(documents)
+            print(f"Successfully loaded {len(documents)} entries from {INITIAL_DATA_PATH}")
+        return vectorstore
+    except Exception as e:
+        print(f"Error loading reference data: {str(e)}")
+        return vectorstore
+async def process_uploaded_files(files, vectorstore):
+    """Process uploaded PDF files and add them to the vector store"""
+    documents_with_metadata = []
+    for file in files:
+        file_path = os.path.join(UPLOAD_PATH, file.name)
+        shutil.copyfile(file.path, file_path)
+        loader = PyMuPDFLoader(file_path)
+        documents = loader.load()
+        for doc in documents:
+            source_name = file.name
+            chunks = semantic_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                doc_chunk = Document(page_content=chunk, metadata={"source": source_name})
+                documents_with_metadata.append(doc_chunk)
+    if documents_with_metadata:
+        # Add documents to vector store
+        vectorstore.add_documents(documents_with_metadata)
+        print(f"Added {len(documents_with_metadata)} chunks from uploaded files")
+        return True
+    return False
+@cl.on_chat_start
+async def start():
+    # Initialize chat history for the agent
+    cl.user_session.set("chat_history", [])
+    # Initialize vector store
+    vectorstore = initialize_vector_store()
+    if not vectorstore:
+        await cl.Message("Error: Could not initialize vector store.").send()
+        return
+    # Load reference data
+    with cl.Step("Loading reference data"):
+        vectorstore = await load_reference_data(vectorstore)
+        cl.user_session.set("qdrant_vectorstore", vectorstore)
+        cl.user_session.set("qdrant_retriever", vectorstore.as_retriever())
+        await cl.Message("Reference data loaded successfully!").send()
+    # Ask for PDF uploads
+    files = await cl.AskFileMessage(
+        content="Please upload **two PDF files** for comparison:",
+        accept=["application/pdf"],
+        max_files=2
+    ).send()
+    if len(files) != 2:
+        await cl.Message("Error: You must upload exactly two PDF files.").send()
+        return
+    # Process uploaded files
+    with cl.Step("Processing uploaded files"):
+        success = await process_uploaded_files(files, vectorstore)
+        if success:
+            # Update the retriever with the latest vector store
+            cl.user_session.set("qdrant_retriever", vectorstore.as_retriever())
+            await cl.Message("Files uploaded and processed successfully! You can now enter your query.").send()
+        else:
+            await cl.Message("Error: Unable to process files. Please try again.").send()
+@cl.on_message
+async def handle_message(message: cl.Message):
+    # Get chat history
+    chat_history = cl.user_session.get("chat_history", [])
+    # Run the agent
+    with cl.Step("Agent thinking"):
+        response = await cl.make_async(agent_executor.invoke)(
+            {"input": message.content, "chat_history": chat_history}
+        )
+    # Handle the response based on the tool that was called
+    if isinstance(response["output"], dict) and "messages" in response["output"]:
+        # This is from document_query_tool
+        await cl.Message(response["output"]["messages"][0].content).send()
+    else:
+        # Generic response (including the confirmation from document_comparison_tool)
+        await cl.Message(content=str(response["output"])).send()
+    # Update chat history with the new exchange
+    chat_history.extend([
+        HumanMessage(content=message.content),
+        HumanMessage(content=str(response["output"]))
+    ])
+    cl.user_session.set("chat_history", chat_history)

chainlit.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Welcome to Chat with Your Text File
2	+ With this application, you can compare uploaded protocol files

data/Instruments_Definitions.xlsx ADDED Viewed

Binary file (10 kB). View file

example_files/Instruments_Definitions.xlsx ADDED Viewed

Binary file (10 kB). View file

example_files/docx/Protocol_NOAPS v1.0.docx ADDED Viewed

Binary file (20.8 kB). View file

example_files/docx/Protocol_PKAS v1.0.docx ADDED Viewed

Binary file (26.2 kB). View file

example_files/docx/Protocol_PPMT v1.0.docx ADDED Viewed

Binary file (20.5 kB). View file

example_files/pdf/Protocol_NOAPS v1.0.pdf ADDED Viewed

Binary file (75 kB). View file

example_files/pdf/Protocol_PKAS v1.0.pdf ADDED Viewed

Binary file (140 kB). View file

example_files/pdf/Protocol_PPMT v1.0.pdf ADDED Viewed

Binary file (48 kB). View file

pyproject.toml ADDED Viewed

	@@ -0,0 +1,38 @@

+[project]
+name = "protocol-sync"
+version = "0.1.0"
+description = "midterm POC huggingface project"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+"IProgress",
+"PyMuPDF",
+"accelerate",
+"chainlit",
+"huggingface_hub",
+"ipykernel",
+"ipywidgets",
+"langchain",
+"langchain-community",
+"langchain-core",
+"langchain-experimental",
+"langchain-huggingface",
+"langchain-openai",
+"langchain-qdrant",
+"langchain-text-splitters",
+"langgraph",
+"langsmith",
+"lxml",
+"openai",
+"pymupdf",
+"pypdf2",
+"qdrant-client",
+"ragas",
+"torch",
+"transformers",
+"tqdm",
+"unstructured",
+"wandb",
+"websockets",
+"openpyxl",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,210 @@

+accelerate==1.4.0
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.6
+aiohttp==3.11.13
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.8.0
+appdirs==1.4.4
+asttokens==3.0.0
+asyncer==0.0.7
+attrs==25.1.0
+backoff==2.2.1
+beautifulsoup4==4.13.3
+bidict==0.23.1
+certifi==2025.1.31
+cffi==1.17.1
+chainlit==2.2.1
+chardet==5.2.0
+charset-normalizer==3.4.1
+chevron==0.14.0
+click==8.1.8
+comm==0.2.2
+cryptography==44.0.1
+dataclasses-json==0.6.7
+datasets==3.3.2
+debugpy==1.8.12
+decorator==5.2.1
+deepdiff==8.2.0
+deprecated==1.2.18
+dill==0.3.8
+diskcache==5.6.3
+distro==1.9.0
+docker-pycreds==0.4.0
+emoji==2.14.1
+executing==2.2.0
+fastapi==0.115.8
+filelock==3.17.0
+filetype==1.2.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+gitdb==4.0.12
+gitpython==3.1.44
+googleapis-common-protos==1.68.0
+greenlet==3.1.1
+grpcio==1.70.0
+grpcio-tools==1.70.0
+h11==0.14.0
+h2==4.2.0
+hpack==4.1.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.29.1
+hyperframe==6.1.0
+idna==3.10
+importlib-metadata==8.5.0
+iprogress==0.4
+ipykernel==6.29.5
+ipython==8.32.0
+ipywidgets==8.1.5
+jedi==0.19.2
+jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+jsonpointer==3.0.0
+jupyter-client==8.6.3
+jupyter-core==5.7.2
+jupyterlab-widgets==3.0.13
+langchain==0.3.15
+langchain-community==0.3.15
+langchain-core==0.3.31
+langchain-experimental==0.3.4
+langchain-huggingface==0.1.2
+langchain-openai==0.3.1
+langchain-qdrant==0.2.0
+langchain-text-splitters==0.3.5
+langdetect==1.0.9
+langgraph==0.2.74
+langgraph-checkpoint==2.0.16
+langgraph-sdk==0.1.53
+langsmith==0.3.10
+lazify==0.4.0
+literalai==0.1.103
+lxml==5.3.1
+markupsafe==3.0.2
+marshmallow==3.26.1
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+networkx==3.4.2
+nltk==3.9.1
+numpy==2.2.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.64.0
+opentelemetry-api==1.29.0
+opentelemetry-exporter-otlp==1.29.0
+opentelemetry-exporter-otlp-proto-common==1.29.0
+opentelemetry-exporter-otlp-proto-grpc==1.29.0
+opentelemetry-exporter-otlp-proto-http==1.29.0
+opentelemetry-instrumentation==0.50b0
+opentelemetry-proto==1.29.0
+opentelemetry-sdk==1.29.0
+opentelemetry-semantic-conventions==0.50b0
+orderly-set==5.3.0
+orjson==3.10.15
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.1.0
+platformdirs==4.3.6
+portalocker==2.10.1
+prompt-toolkit==3.0.50
+propcache==0.3.0
+protobuf==5.29.3
+psutil==7.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.3
+pyarrow==19.0.1
+pycparser==2.22
+pydantic==2.10.6
+pydantic-core==2.27.2
+pydantic-settings==2.8.0
+pygments==2.19.1
+pyjwt==2.10.1
+pymupdf==1.25.3
+pypdf==5.3.0
+pypdf2==3.0.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-engineio==4.11.2
+python-iso639==2025.2.18
+python-magic==0.4.27
+python-multipart==0.0.18
+python-socketio==5.12.1
+pytz==2025.1
+pyyaml==6.0.2
+pyzmq==26.2.1
+qdrant-client==1.13.2
+ragas==0.2.13
+rapidfuzz==3.12.1
+regex==2024.11.6
+requests==2.32.3
+requests-toolbelt==1.0.0
+safetensors==0.5.2
+scikit-learn==1.6.1
+scipy==1.15.2
+sentence-transformers==3.4.1
+sentry-sdk==2.22.0
+setproctitle==1.3.5
+setuptools==75.8.0
+simple-websocket==1.1.0
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.6
+sqlalchemy==2.0.38
+stack-data==0.6.3
+starlette==0.41.3
+sympy==1.13.1
+syncer==2.0.3
+tabulate==0.9.0
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tiktoken==0.9.0
+tokenizers==0.21.0
+tomli==2.2.1
+torch==2.6.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.49.0
+triton==3.2.0
+typing-extensions==4.12.2
+typing-inspect==0.9.0
+tzdata==2025.1
+unstructured==0.14.8
+unstructured-client==0.25.9
+uptrace==1.29.0
+urllib3==2.3.0
+uvicorn==0.34.0
+wandb==0.19.7
+watchfiles==0.20.0
+wcwidth==0.2.13
+websockets==15.0
+widgetsnbextension==4.0.13
+wrapt==1.17.2
+wsproto==1.2.0
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0
+zstandard==0.23.0

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff