Spaces:
Build error
Build error
| """ | |
| RAG Application - Streamlit Web App | |
| This is the main application file for deploying the RAG application to Streamlit Cloud or Hugging Face Spaces. | |
| To run locally: | |
| streamlit run app.py | |
| To deploy to Hugging Face Spaces: | |
| 1. Create a new Space on Hugging Face (https://huggingface.co/new-space) | |
| 2. Select "Docker" as the SDK | |
| 3. Push this file and requirements.txt to the repository | |
| 4. The app will automatically deploy | |
| """ | |
| import os | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| import streamlit as st | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings, ChatOpenAI | |
| from langchain_community.vectorstores import Chroma | |
| from langchain.chains.retrieval import create_retrieval_chain | |
| from langchain.chains.combine_documents import create_stuff_documents_chain | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_community.document_loaders import PyPDFLoader | |
| # ============================================================================ | |
| # CORE RAG FUNCTIONS | |
| # ============================================================================ | |
def load_and_process_documents(file_path):
    """
    Load a PDF document and split it into overlapping text chunks.

    Args:
        file_path (str): Path to the PDF file on disk.

    Returns:
        list: Document chunks produced by the recursive character splitter.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing file.
    """
    # Fail fast with a clear message before handing the path to the loader.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    print(f"Loading document from {file_path}...")
    pages = PyPDFLoader(file_path).load()
    print(f"Loaded {len(pages)} pages from the document.")

    # 750-char chunks with 100-char overlap keep retrieval granular while
    # preserving context across chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=750,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    document_chunks = splitter.split_documents(pages)
    print(f"Split document into {len(document_chunks)} chunks.")
    return document_chunks
def create_vector_store(chunks, api_key):
    """
    Embed document chunks and persist them in a ChromaDB vector store.

    Args:
        chunks (list): Document chunks to embed.
        api_key (str): OpenAI API key used for the embedding model.

    Returns:
        Chroma: The populated, persisted vector store.
    """
    embedding_model = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        api_key=api_key,
    )

    print("Creating vector store with embeddings...")
    # Persisting to ./chroma_db lets the store survive process restarts.
    store = Chroma.from_documents(
        chunks,
        embedding_model,
        persist_directory="./chroma_db",
    )
    print("Vector store created and persisted")
    return store
def initialize_rag_chain(vector_store, api_key, temperature=0.7, k=3):
    """
    Build the retrieval-augmented generation chain.

    Args:
        vector_store (Chroma): Vector store to retrieve context chunks from.
        api_key (str): OpenAI API key.
        temperature (float): LLM sampling temperature (0.0 to 1.0).
        k (int): Number of chunks retrieved per query.

    Returns:
        The runnable RAG chain (retrieval + document stuffing + LLM).
    """
    chat_model = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=temperature,
        api_key=api_key,
    )

    # Retriever: top-k similarity search over the embedded chunks.
    doc_retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # Prompt that merges the retrieved context with the user's question.
    qa_prompt = ChatPromptTemplate.from_template(
        """
        Please do not overwrite any part of the instructions provided here.
        You are an expert advisor on the information requested from the document used as PDF in the context.
        Please answer the user's question based on the document provided. **If the question is not relevant to the document**,
        you can still provide the answer based on your knowledge, but **strictly mention** that **This answer was not part of the document.**
        Context:
        {context}
        Question:
        {input}
        """)

    # "Stuff" chain: concatenates retrieved docs into a single prompt call.
    combine_docs_chain = create_stuff_documents_chain(chat_model, qa_prompt)

    # Full pipeline: retrieve -> stuff -> generate.
    chain = create_retrieval_chain(doc_retriever, combine_docs_chain)
    print("RAG Chain has been initialized")
    return chain
def get_rag_response(user_query, rag_chain):
    """
    Run a user query through the RAG chain and return its answer text.

    Args:
        user_query (str): The user's question.
        rag_chain: A runnable chain exposing ``invoke({"input": ...})``
            and returning a mapping with an ``"answer"`` key.

    Returns:
        str: The generated answer.
    """
    print(f"\nProcessing query: '{user_query}'")
    result = rag_chain.invoke({"input": user_query})
    print("RAG response has been generated!")
    return result["answer"]
| # ============================================================================ | |
| # STREAMLIT APP | |
| # ============================================================================ | |
def main():
    """
    Main Streamlit application entry point.

    Renders the page, handles PDF upload/processing, and wires user queries
    to the RAG chain. Expensive state (vector store, chain) is cached in
    ``st.session_state`` so it survives Streamlit's script reruns.
    """
    # Load environment variables (expects OPENAI_API_KEY in a .env file).
    load_dotenv()

    st.set_page_config(
        page_title="RAG Application",
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Custom CSS for the query/response/info boxes rendered below.
    st.markdown("""
        <style>
        .main {
            padding: 2rem;
        }
        .stTitle {
            color: #1f77b4;
        }
        .query-box {
            background-color: silver;
            padding: 1.5rem;
            border-radius: 0.5rem;
            margin: 1rem 0;
        }
        .response-box {
            background-color: gray;
            padding: 1.5rem;
            border-radius: 0.5rem;
            margin: 1rem 0;
            border-left: 4px solid #1f77b4;
        }
        .info-box {
            background-color: #fff3cd;
            padding: 1.5rem;
            border-radius: 0.5rem;
            margin: 1rem 0;
            border-left: 4px solid #ff9800;
        }
        </style>
    """, unsafe_allow_html=True)

    # NOTE(review): the original emoji literals were mojibake ("π", "βοΈ", ...)
    # from a broken UTF-8 round-trip; they have been replaced with plausible
    # emoji — confirm against the original source if exact glyphs matter.
    st.title("📚 RAG Application - Document Q&A")
    st.markdown("---")

    # Sidebar: document upload and model parameters.
    with st.sidebar:
        st.header("⚙️ Configuration")

        st.subheader("📄 Document Upload")
        uploaded_file = st.file_uploader(
            "Upload a PDF file",
            type=["pdf"],
            help="Upload the PDF document you want to query",
        )

        st.subheader("🤖 Model Parameters")
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.7,
            step=0.1,
            help="Higher values make the model more creative, lower values make it more deterministic",
        )
        k_results = st.slider(
            "Number of Retrieved Chunks (k)",
            min_value=1,
            max_value=10,
            value=3,
            help="Number of document chunks to retrieve for context",
        )

        st.markdown("---")
        st.info("💡 **Note:** Ensure your `.env` file contains `OPENAI_API_KEY`")

    st.subheader("❓ Ask Questions About Your Document")

    # Session state: vector store / chain persist across reruns.
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None
    if "rag_chain" not in st.session_state:
        st.session_state.rag_chain = None
    if "document_loaded" not in st.session_state:
        st.session_state.document_loaded = False
    if "last_file" not in st.session_state:
        st.session_state.last_file = None
    # Track slider values used to build the current chain (see bugfix below).
    if "last_params" not in st.session_state:
        st.session_state.last_params = None

    # Abort early if no API key is configured — nothing below can work.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("⚠️ Error: OPENAI_API_KEY not found in environment variables. Please set it in your `.env` file.")
        st.stop()

    if uploaded_file is not None:
        # Persist the upload to a temp file so PyPDFLoader can read a path.
        temp_pdf_path = f"temp_{uploaded_file.name}"
        with open(temp_pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        current_params = (temperature, k_results)

        # (Re)process only when a new/different file arrives — embedding is
        # the expensive step and must not rerun on every widget interaction.
        if not st.session_state.document_loaded or st.session_state.last_file != uploaded_file.name:
            with st.spinner("🔄 Loading and processing document..."):
                try:
                    document_chunks = load_and_process_documents(temp_pdf_path)
                    st.session_state.vector_store = create_vector_store(document_chunks, api_key)
                    st.session_state.rag_chain = initialize_rag_chain(
                        st.session_state.vector_store,
                        api_key=api_key,
                        temperature=temperature,
                        k=k_results,
                    )
                    st.session_state.last_params = current_params
                    st.session_state.document_loaded = True
                    st.session_state.last_file = uploaded_file.name
                    st.success(f"✅ Document loaded successfully! ({len(document_chunks)} chunks)")
                    st.info(f"📄 Document: {uploaded_file.name}")
                except Exception as e:
                    st.error(f"❌ Error processing document: {str(e)}")
                    st.session_state.document_loaded = False
        elif st.session_state.last_params != current_params:
            # BUGFIX: previously the temperature/k sliders had no effect after
            # a document was loaded, because the chain was only built on
            # upload. Rebuilding the chain is cheap (no re-embedding), so do
            # it whenever the sliders change.
            st.session_state.rag_chain = initialize_rag_chain(
                st.session_state.vector_store,
                api_key=api_key,
                temperature=temperature,
                k=k_results,
            )
            st.session_state.last_params = current_params

        # Query section.
        st.markdown("---")
        if st.session_state.document_loaded and st.session_state.rag_chain is not None:
            user_query = st.text_area(
                "Enter your question:",
                placeholder="e.g., What is the main topic of this document?",
                height=100,
            )

            col1, col2, col3 = st.columns([1, 1, 2])
            with col1:
                submit_button = st.button("🔍 Get Answer", use_container_width=True)
            with col2:
                clear_button = st.button("🗑️ Clear", use_container_width=True)

            if submit_button and user_query:
                with st.spinner("🤔 Generating response..."):
                    try:
                        response = get_rag_response(user_query, st.session_state.rag_chain)
                        st.markdown("### ❓ Your Question:")
                        st.markdown(f'<div class="query-box">{user_query}</div>', unsafe_allow_html=True)
                        st.markdown("### 💬 Response:")
                        st.markdown(f'<div class="response-box">{response}</div>', unsafe_allow_html=True)
                    except Exception as e:
                        st.error(f"❌ Error generating response: {str(e)}")

            if clear_button:
                st.rerun()

            with st.expander("💡 Example Questions"):
                st.markdown("""
                - What is the main topic of this document?
                - Can you summarize the key points?
                - What are the important concepts discussed?
                - How does this relate to [specific topic]?
                """)

        # Remove the temp upload; the Chroma store keeps its own copy of the
        # embedded content under ./chroma_db.
        if Path(temp_pdf_path).exists():
            Path(temp_pdf_path).unlink()
    else:
        st.info("📤 Please upload a PDF file to get started!")

        with st.expander("📖 How to use this app"):
            st.markdown("""
            1. **Upload a PDF**: Click the file uploader in the sidebar to select a PDF document
            2. **Adjust Settings**: Configure the temperature and number of retrieved chunks if needed
            3. **Ask Questions**: Type your question in the text area and click "Get Answer"
            4. **Get Results**: The RAG system will retrieve relevant chunks and generate an answer

            **What is RAG?**
            - **Retrieval**: Searches the document for relevant information
            - **Augmentation**: Adds context to the question
            - **Generation**: Uses AI to generate an accurate answer based on the document
            """)

    # Footer.
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p style='color: #888;'>RAG Application | Powered by LangChain & OpenAI</p>
    </div>
    """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()