MOHITRAJDEO12345 committed
Commit b3f1583 · 0 parent(s)

Fresh start: Clean repository without binary files
Files changed (9)
  1. .gitattributes +35 -0
  2. .gitignore +164 -0
  3. .streamlit/config.toml +2 -0
  4. Dockerfile +21 -0
  5. README.md +20 -0
  6. requirements.txt +9 -0
  7. src/ingestor.py +102 -0
  8. src/pipeline.py +124 -0
  9. src/streamlit_app.py +123 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,164 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # ChromaDB vector database
+ data/
+ chroma/
+ *.db
+ *.sqlite3
+
+ # Streamlit
+ .streamlit/secrets.toml
+ .streamlit/config.toml.backup
+
+ # IDEs
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS generated files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Logs
+ *.log
+ logs/
+
+ # Temporary files
+ *.tmp
+ *.temp
+ .cache/
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ # Streamlit configuration
+ # Data directory will be set automatically
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.13.5-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt ./
+ COPY src/ ./src/
+ COPY .streamlit/ .streamlit/
+
+ RUN pip3 install -r requirements.txt
+
+ EXPOSE 8501
+
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
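
The `HEALTHCHECK` above polls Streamlit's built-in `/_stcore/health` endpoint. The same probe can be run from the host; a minimal sketch, assuming the container is running with the default `-p 8501:8501` port mapping:

```python
# Sketch: probe the endpoint the Dockerfile HEALTHCHECK uses.
# Assumes the container is running and port 8501 is published to the host.
import urllib.request

with urllib.request.urlopen("http://localhost:8501/_stcore/health", timeout=5) as resp:
    print(resp.status, resp.read().decode().strip())  # a healthy app returns HTTP 200
```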
README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ title: DocuMind
+ emoji: 🚀
+ colorFrom: red
+ colorTo: red
+ sdk: docker
+ app_port: 8501
+ tags:
+ - streamlit
+ pinned: false
+ short_description: The DocuMind system, as outlined and implemented in this rep
+ license: mit
+ ---
+
+ # Welcome to Streamlit!
+
+ Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
+
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+ forums](https://discuss.streamlit.io).
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ langchain-core
+ langchain
+ langchain-community
+ langchain-google-genai
+ chromadb
+ pypdf
+ pymupdf
+ python-dotenv
src/ingestor.py ADDED
@@ -0,0 +1,102 @@
+ import asyncio
+ import hashlib
+ import json
+ import os
+ from typing import List
+
+ import fitz  # PyMuPDF
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_core.documents import Document
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+
+ class Ingestor:
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         # Ensure an event loop is available for GoogleGenerativeAIEmbeddings
+         try:
+             asyncio.get_running_loop()
+         except RuntimeError:
+             asyncio.set_event_loop(asyncio.new_event_loop())
+
+         # Initialize the embedding model
+         self.embeddings = GoogleGenerativeAIEmbeddings(
+             model="models/embedding-001",
+             google_api_key=self.api_key,
+         )
+
+     def load_and_chunk_pdfs(self, file_paths: List[str]) -> List[Document]:
+         """Loads PDFs and splits them into chunks with metadata."""
+         all_chunks = []
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000,
+             chunk_overlap=100,
+             separators=["\n\n", "\n", " ", ""],
+             length_function=len,
+         )
+
+         for file_path in file_paths:
+             try:
+                 # Use PyMuPDF to open and extract text from the PDF
+                 doc = fitz.open(file_path)
+
+                 # Extract text page by page with metadata
+                 for page_num, page in enumerate(doc):
+                     text = page.get_text()
+
+                     # Create a LangChain Document object with metadata
+                     langchain_doc = Document(
+                         page_content=text,
+                         metadata={
+                             "source": os.path.basename(file_path),
+                             "page": page_num + 1,
+                         },
+                     )
+
+                     # Split the page text into chunks
+                     chunks = text_splitter.split_documents([langchain_doc])
+                     all_chunks.extend(chunks)
+
+                 doc.close()
+
+             except Exception as e:
+                 print(f"Error processing {file_path}: {e}")
+
+         return all_chunks
+
+     def ingest_documents(self, file_paths: List[str]) -> Chroma:
+         """Ingests documents, creates embeddings, and initializes a ChromaDB vector store."""
+
+         # Check whether a cached vector store exists for this document set.
+         # The cache key is a hash of the file names, ensuring it is unique per set of docs.
+         cache_key = hashlib.sha256(json.dumps(sorted(os.path.basename(p) for p in file_paths)).encode()).hexdigest()
+
+         # Persist each document set under its own directory
+         persist_directory = os.path.join("./data/db", cache_key)
+
+         # Check if the vector store has been created and cached before
+         if os.path.exists(persist_directory):
+             print("Loading existing vector store from cache...")
+             vector_store = Chroma(
+                 persist_directory=persist_directory,
+                 embedding_function=self.embeddings,
+             )
+             # A simple check to ensure the vector store is not empty
+             if vector_store.get()["documents"]:
+                 return vector_store
+
+         print("Creating new vector store from documents...")
+         # Load and chunk documents
+         chunks = self.load_and_chunk_pdfs(file_paths)
+         if not chunks:
+             raise ValueError("No valid document chunks could be created.")
+
+         # Create the ChromaDB vector store from the chunks and embeddings
+         vector_store = Chroma.from_documents(
+             documents=chunks,
+             embedding=self.embeddings,
+             persist_directory=persist_directory,
+         )
+         # Persist the vector store to disk
+         vector_store.persist()
+         return vector_store
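
The chunking stage can be sanity-checked on its own before any embeddings are created. A minimal sketch, where `report.pdf` and the API key are placeholders rather than anything shipped in this repo:

```python
# Sketch: inspect the chunks produced by Ingestor.load_and_chunk_pdfs.
# "report.pdf" and the API key below are placeholders.
from ingestor import Ingestor

ingestor = Ingestor(api_key="YOUR_GEMINI_API_KEY")
chunks = ingestor.load_and_chunk_pdfs(["report.pdf"])
for chunk in chunks[:3]:
    # Each chunk carries the source file name and a 1-based page number
    print(chunk.metadata, len(chunk.page_content))
```

With `chunk_size=1000` and `chunk_overlap=100`, a page shorter than 1000 characters passes through as a single chunk; longer pages are split with 100 characters of overlap between neighbors.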
src/pipeline.py ADDED
@@ -0,0 +1,124 @@
+ from typing import List
+
+ from langchain_community.vectorstores import Chroma
+ from langchain_core.prompts import PromptTemplate
+ from langchain_google_genai import ChatGoogleGenerativeAI
+
+
+ class RAGPipeline:
+     def __init__(self, vector_store: Chroma, api_key: str):
+         self.vector_store = vector_store
+         self.llm = ChatGoogleGenerativeAI(
+             model="gemini-2.0-flash",
+             google_api_key=api_key,
+             temperature=0.2,
+         )
+         self.retriever = self.vector_store.as_retriever(
+             search_type="similarity",
+             search_kwargs={"k": 5},
+         )
+
+         # Define the prompt template for the LLM.
+         # It instructs the model to answer from the provided context only
+         # and to include source citations.
+         template = """
+         You are a helpful assistant. Use the following context to answer the question at the end.
+         If you don't know the answer, just say that you don't know; don't try to make up an answer.
+
+         Context:
+         {context}
+
+         Question:
+         {question}
+
+         Instructions:
+         1. Provide a detailed and accurate answer based ONLY on the provided context.
+         2. When referencing information, mention which source and page it comes from.
+         3. If the context doesn't contain enough information, say so clearly.
+         4. Keep your answer concise but comprehensive.
+
+         Answer:
+         """
+         self.prompt = PromptTemplate(
+             template=template,
+             input_variables=["context", "question"],
+         )
+
+     def format_documents_with_citations(self, documents: List) -> str:
+         """
+         Formats the retrieved documents into a single string, including metadata for citations.
+         """
+         formatted_text = []
+         for i, doc in enumerate(documents, 1):
+             content = doc.page_content
+             source = doc.metadata.get("source", "unknown")
+             page = doc.metadata.get("page", "unknown")
+             formatted_text.append(f"Source {i}:\nFile: {source}\nPage: {page}\nContent:\n{content}\n")
+         return "\n---\n".join(formatted_text)
+
+     def get_source_info_with_scores(self, documents: List) -> str:
+         """
+         Gets source information with heuristic confidence scores for the retrieved documents.
+         """
+         source_info = []
+         for i, doc in enumerate(documents, 1):
+             source = doc.metadata.get("source", "unknown")
+             page = doc.metadata.get("page", "unknown")
+
+             # Heuristic confidence score built from three factors:
+             # 1. Retrieval order (higher for top-ranked results)
+             # 2. Content length (longer chunks are weighted as more informative)
+             # 3. Position in document (earlier pages are weighted slightly higher)
+             base_score = 1.0 - (i - 1) * 0.15  # Order factor
+             length_factor = min(1.0, len(doc.page_content) / 1000)  # Length factor
+             page_factor = max(0.8, 1.0 - (page - 1) * 0.05) if isinstance(page, int) else 1.0
+
+             confidence_score = base_score * length_factor * page_factor
+             confidence_score = max(0.1, min(1.0, confidence_score))  # Clamp between 0.1 and 1.0
+             confidence_percent = int(confidence_score * 100)
+
+             # Map the percentage to a confidence level
+             if confidence_percent >= 90:
+                 level = "Very High"
+             elif confidence_percent >= 75:
+                 level = "High"
+             elif confidence_percent >= 60:
+                 level = "Medium"
+             elif confidence_percent >= 40:
+                 level = "Low"
+             else:
+                 level = "Very Low"
+
+             source_info.append(f"• **Source {i}**: {source}")
+             source_info.append(f"  - **Page**: {page}")
+             source_info.append(f"  - **Confidence**: {confidence_percent}% ({level})")
+             source_info.append(f"  - **Content Preview**: {doc.page_content[:200]}...")
+
+         return "\n".join(source_info)
+
+     def answer_question(self, question: str) -> str:
+         """
+         Executes the RAG pipeline: retrieves documents and generates a response.
+         """
+         # Step 1: Retrieve the most relevant documents
+         retrieved_docs = self.retriever.get_relevant_documents(question)
+
+         if not retrieved_docs:
+             return "I am sorry, I could not find any relevant information in the documents to answer your question."
+
+         # Step 2: Format the retrieved documents for the prompt
+         formatted_context = self.format_documents_with_citations(retrieved_docs)
+
+         # Step 3: Create the final prompt
+         final_prompt = self.prompt.format(context=formatted_context, question=question)
+
+         # Step 4: Call the LLM to generate the answer
+         response = self.llm.invoke(final_prompt).content
+
+         # Step 5: Add source information and confidence scores to the response
+         source_info = self.get_source_info_with_scores(retrieved_docs)
+
+         # Combine the response with source information
+         full_response = f"{response}\n\n**Sources and Context:**\n{source_info}"
+
+         return full_response
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import altair as alt
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ import streamlit as st
7
+
8
+ from dotenv import load_dotenv
9
+ import os
10
+ from ingestor import Ingestor
11
+ from pipeline import RAGPipeline
12
+ import tempfile
13
+
14
+ # Set the event loop policy for Windows (if available)
15
+ try:
16
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
17
+ except AttributeError:
18
+ # WindowsSelectorEventLoopPolicy not available, use default
19
+ pass
20
+
21
+ # 1. Set up the Streamlit page configuration and title
22
+ st.set_page_config(page_title="πŸ“„ DocuMind: Your Document AI", page_icon="πŸ“„")
23
+ st.title("πŸ“„ DocuMind: Document QA with Gemini")
24
+
25
+ # 2. Add a sidebar for API key and instructions
26
+ with st.sidebar:
27
+ st.header("Configuration")
28
+ st.info("To get started, please upload your PDF document(s).")
29
+ gemini_api_key = st.text_input("Gemini API Key", type="password")
30
+
31
+ # Check for API key and load from.env if available
32
+ if not gemini_api_key:
33
+ load_dotenv()
34
+ gemini_api_key = os.getenv("GEMINI_API_KEY")
35
+
36
+ if not gemini_api_key:
37
+ st.warning("Please enter a valid Gemini API key!")
38
+ st.stop()
39
+
40
+ # Store API key in session state for reuse
41
+ st.session_state["gemini_api_key"] = gemini_api_key
42
+
43
+ # 3. Handle file uploads
44
+ uploaded_files = st.file_uploader(
45
+ "Upload your PDF documents",
46
+ type="pdf",
47
+ accept_multiple_files=True,
48
+ )
49
+
50
+ # Use st.session_state to handle RAG state persistence across reruns
51
+ if "rag_pipeline" not in st.session_state:
52
+ st.session_state["rag_pipeline"] = None
53
+ st.session_state["ingested_docs"] = []
54
+
55
+ # 4. Ingest documents and set up the RAG pipeline
56
+ if uploaded_files and st.session_state["rag_pipeline"] is None:
57
+ with st.spinner("Processing documents... This may take a moment."):
58
+ # Create a temporary directory to save uploaded files
59
+ with tempfile.TemporaryDirectory() as temp_dir:
60
+ file_paths = []
61
+ for uploaded_file in uploaded_files:
62
+ file_path = os.path.join(temp_dir, uploaded_file.name)
63
+ with open(file_path, "wb") as f:
64
+ f.write(uploaded_file.getbuffer())
65
+ file_paths.append(file_path)
66
+
67
+ try:
68
+ # Ingest documents and create the ChromaDB vector store
69
+ ingestor = Ingestor(api_key=gemini_api_key)
70
+ vector_store = ingestor.ingest_documents(file_paths)
71
+
72
+ # Initialize the RAG pipeline with the vector store
73
+ st.session_state["rag_pipeline"] = RAGPipeline(
74
+ vector_store=vector_store,
75
+ api_key=gemini_api_key,
76
+ )
77
+
78
+ # Store the names of the ingested documents for display
79
+ st.session_state["ingested_docs"] = [f.name for f in uploaded_files]
80
+
81
+ st.success("Documents processed successfully!")
82
+
83
+ except Exception as e:
84
+ st.error(f"An error occurred during document ingestion: {e}")
85
+ st.session_state["rag_pipeline"] = None
86
+
87
+ # 5. Display a list of ingested documents
88
+ if st.session_state["ingested_docs"]:
89
+ with st.expander("Documents in Knowledge Base"):
90
+ st.write("The following documents have been successfully ingested:")
91
+ for doc_name in st.session_state["ingested_docs"]:
92
+ st.markdown(f"- {doc_name}")
93
+
94
+ # 6. Set up the chat interface
95
+ if "messages" not in st.session_state:
96
+ st.session_state.messages = []
97
+
98
+ # Display chat messages from history
99
+ for message in st.session_state.messages:
100
+ with st.chat_message(message["role"]):
101
+ st.markdown(message["content"])
102
+
103
+ # Process user question if RAG pipeline is ready
104
+ if st.session_state["rag_pipeline"]:
105
+ question = st.chat_input("Ask a question about the documents...")
106
+
107
+ if question:
108
+ # Display user message
109
+ st.session_message = st.chat_message("user")
110
+ st.session_message.markdown(question)
111
+ st.session_state.messages.append({"role": "user", "content": question})
112
+
113
+ with st.chat_message("assistant"):
114
+ with st.spinner("Thinking..."):
115
+ try:
116
+ # Get the answer from the RAG pipeline
117
+ response = st.session_state["rag_pipeline"].answer_question(question)
118
+
119
+ # Display the response using st.markdown
120
+ st.markdown(response)
121
+
122
+ # Add assistant response to chat history
123
+ st.session_state.messages.append({"role": "assistant", "content": response})
124
+
125
+ except Exception as e:
126
+ st.error(f"An error occurred during response generation: {e}")
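
The two modules also compose without the UI, which is handy for quick end-to-end checks. A minimal headless sketch, where `report.pdf` and the API key are placeholders:

```python
# Sketch: the same ingest-then-ask flow the Streamlit app drives.
# "report.pdf" and the API key are placeholders.
from ingestor import Ingestor
from pipeline import RAGPipeline

api_key = "YOUR_GEMINI_API_KEY"
store = Ingestor(api_key=api_key).ingest_documents(["report.pdf"])
rag = RAGPipeline(vector_store=store, api_key=api_key)
print(rag.answer_question("What is the main conclusion of the report?"))
```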