Update app.py
app.py CHANGED
@@ -1,10 +1,4 @@
-"""
-Streamlit application for PDF-based Retrieval-Augmented Generation (RAG) using Ollama + LangChain.
-
-This application allows users to upload a PDF, process it,
-and then ask questions about the content using a selected language model.
-"""
-
+!pip install langchain-community # Install the missing module
 import streamlit as st
 import logging
 import os
@@ -46,15 +40,7 @@ logger = logging.getLogger(__name__)
 def extract_model_names(
     models_info: Dict[str, List[Dict[str, Any]]],
 ) -> Tuple[str, ...]:
-    """
-    Extract model names from the provided models information.
-
-    Args:
-        models_info (Dict[str, List[Dict[str, Any]]]): Dictionary containing information about available models.
-
-    Returns:
-        Tuple[str, ...]: A tuple of model names.
-    """
+    """Extract model names from the provided models information."""
     logger.info("Extracting model names from models_info")
     model_names = tuple(model["name"] for model in models_info["models"])
     logger.info(f"Extracted model names: {model_names}")
@@ -62,15 +48,7 @@ def extract_model_names(
 
 
 def create_vector_db(file_upload) -> Chroma:
-    """
-    Create a vector database from an uploaded PDF file.
-
-    Args:
-        file_upload (st.UploadedFile): Streamlit file upload object containing the PDF.
-
-    Returns:
-        Chroma: A vector store containing the processed document chunks.
-    """
+    """Create a vector database from an uploaded PDF file."""
     logger.info(f"Creating vector DB from file upload: {file_upload.name}")
     temp_dir = tempfile.mkdtemp()
 
@@ -97,19 +75,8 @@ def create_vector_db(file_upload) -> Chroma:
 
 
 def process_question(question: str, vector_db: Chroma, selected_model: str) -> str:
-    """
-    Process a user question using the vector database and selected language model.
-
-    Args:
-        question (str): The user's question.
-        vector_db (Chroma): The vector database containing document embeddings.
-        selected_model (str): The name of the selected language model.
-
-    Returns:
-        str: The generated response to the user's question.
-    """
-    logger.info(f"""Processing question: {
-        question} using model: {selected_model}""")
+    """Process a user question using the vector database and selected language model."""
+    logger.info(f"Processing question: {question} using model: {selected_model}")
     llm = ChatOllama(model=selected_model, temperature=0)
     QUERY_PROMPT = PromptTemplate(
         input_variables=["question"],
@@ -149,17 +116,8 @@ def process_question(question: str, vector_db: Chroma, selected_model: str) -> str:
 
 @st.cache_data
 def extract_all_pages_as_images(file_upload) -> List[Any]:
-    """
-    Extract all pages from a PDF file as images.
-
-    Args:
-        file_upload (st.UploadedFile): Streamlit file upload object containing the PDF.
-
-    Returns:
-        List[Any]: A list of image objects representing each page of the PDF.
-    """
-    logger.info(f"""Extracting all pages as images from file: {
-        file_upload.name}""")
+    """Extract all pages from a PDF file as images."""
+    logger.info(f"Extracting all pages as images from file: {file_upload.name}")
     pdf_pages = []
     with pdfplumber.open(file_upload) as pdf:
         pdf_pages = [page.to_image().original for page in pdf.pages]
@@ -168,12 +126,7 @@ def extract_all_pages_as_images(file_upload) -> List[Any]:
 
 
 def delete_vector_db(vector_db: Optional[Chroma]) -> None:
-    """
-    Delete the vector database and clear related session state.
-
-    Args:
-        vector_db (Optional[Chroma]): The vector database to be deleted.
-    """
+    """Delete the vector database and clear related session state."""
     logger.info("Deleting vector DB")
     if vector_db is not None:
         vector_db.delete_collection()
@@ -189,12 +142,7 @@ def delete_vector_db(vector_db: Optional[Chroma]) -> None:
 
 
 def main() -> None:
-    """
-    Main function to run the Streamlit application.
-
-    This function sets up the user interface, handles file uploads,
-    processes user queries, and displays results.
-    """
+    """Main function to run the Streamlit application."""
     st.subheader("🧠 Ollama PDF RAG playground", divider="gray", anchor=False)
 
     models_info = ollama.list()
@@ -246,33 +194,4 @@ def main() -> None:
             with message_container.chat_message(message["role"], avatar=avatar):
                 st.markdown(message["content"])
 
-
-            try:
-                st.session_state["messages"].append({"role": "user", "content": prompt})
-                message_container.chat_message("user", avatar="😎").markdown(prompt)
-
-                with message_container.chat_message("assistant", avatar="🤖"):
-                    with st.spinner(":green[processing...]"):
-                        if st.session_state["vector_db"] is not None:
-                            response = process_question(
-                                prompt, st.session_state["vector_db"], selected_model
-                            )
-                            st.markdown(response)
-                        else:
-                            st.warning("Please upload a PDF file first.")
-
-                if st.session_state["vector_db"] is not None:
-                    st.session_state["messages"].append(
-                        {"role": "assistant", "content": response}
-                    )
-
-            except Exception as e:
-                st.error(e, icon="⛔️")
-                logger.error(f"Error processing prompt: {e}")
-        else:
-            if st.session_state["vector_db"] is None:
-                st.warning("Upload a PDF file to begin chat...")
-
-
-if __name__ == "__main__":
-    main()
+
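A note on the added first line: `!pip install langchain-community` is IPython/Jupyter shell syntax, not Python, so the Streamlit runtime will stop with a SyntaxError when it executes app.py. On a Streamlit Space, a missing package is normally declared in a requirements.txt next to app.py instead; a minimal sketch under that assumption (the `ChatOllama` import path is illustrative, not taken from this diff):

# requirements.txt — Spaces installs each listed package when the Space builds
langchain-community

# app.py — with the package installed, the !pip line is unnecessary; import normally
from langchain_community.chat_models import ChatOllama  # illustrative import from langchain-community

This keeps app.py valid Python and lets the Space resolve the dependency at build time rather than at runtime.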