Spaces:

avimittal30
/

FinQuery

Build error

App Files Files Community

avimittal30 committed on Apr 4, 2025

Commit

a7aaec4

1 Parent(s): 793774f

pushing files

Browse files

Files changed (4) hide show

app.py +144 -0
data.py +92 -0
helper.py +115 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import streamlit as st
+import os
+import tempfile
+import pickle
+import faiss
+import numpy as np
+from helper import extract_text_from_pdf, chunk_text, embedding_function, embedding_model, query_llm_with_context
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Set page configuration
+st.set_page_config(
+    page_title="PDF RAG System",
+    page_icon="📚",
+    layout="wide"
+)
+# Title and description
+st.title("📚 PDF RAG System")
+st.markdown("""
+This application allows you to upload a PDF file, ask questions about its content, and get AI-generated answers based on the document.
+""")
+# File upload section
+st.header("1. Upload PDF")
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+# Initialize session state variables
+if 'pdf_processed' not in st.session_state:
+    st.session_state.pdf_processed = False
+if 'index' not in st.session_state:
+    st.session_state.index = None
+if 'chunks' not in st.session_state:
+    st.session_state.chunks = None
+if 'pdf_path' not in st.session_state:
+    st.session_state.pdf_path = None
+# Process the uploaded PDF
+if uploaded_file is not None and not st.session_state.pdf_processed:
+    with st.spinner("Processing PDF..."):
+        # Create a temporary file to save the uploaded PDF
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+            tmp_file.write(uploaded_file.getvalue())
+            st.session_state.pdf_path = tmp_file.name
+        # Extract text from PDF
+        pdf_text = extract_text_from_pdf(st.session_state.pdf_path)
+        # Chunk the text
+        chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)
+        st.session_state.chunks = chunks
+        # Create embeddings
+        embeddings = embedding_function(chunks)
+        # Convert embeddings to numpy array if they aren't already
+        if not isinstance(embeddings, np.ndarray):
+            embeddings = np.array(embeddings).astype('float32')
+        # Get the dimension of the embeddings
+        dimension = embeddings.shape[1]
+        # Initialize FAISS index
+        index = faiss.IndexFlatL2(dimension)
+        # Add vectors to the index
+        index.add(embeddings)
+        # Save the index and chunks
+        faiss.write_index(index, "./faiss_index")
+        with open("./document_chunks.pkl", 'wb') as f:
+            pickle.dump(chunks, f)
+        # Update session state
+        st.session_state.index = index
+        st.session_state.pdf_processed = True
+        st.success(f"PDF processed successfully! {len(chunks)} chunks created.")
+# Query section
+st.header("2. Ask a Question")
+query = st.text_input("Enter your question about the PDF content:")
+# Add a button to submit the query
+if st.button("Get Answer") and query and st.session_state.pdf_processed:
+    with st.spinner("Retrieving relevant information and generating answer..."):
+        try:
+            # Generate embedding for the query
+            query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
+            # Search the index
+            n_results = 5
+            distances, indices = st.session_state.index.search(query_embedding, n_results)
+            # Get the documents
+            documents = [st.session_state.chunks[i] for i in indices[0]]
+            # Convert distances to similarity scores (L2 distance: lower is better)
+            # Normalize distances to [0, 1] range where 1 is most similar
+            max_distance = np.max(distances)
+            similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
+            # Create context from retrieved documents
+            context = (documents, similarity_scores)
+            # Query the LLM with context
+            answer = query_llm_with_context(query, context, top_n=3)
+            # Display the answer
+            st.header("3. Answer")
+            st.write(answer)
+            # Display the retrieved documents
+            with st.expander("View Retrieved Documents"):
+                for i, (doc, score) in enumerate(zip(documents, similarity_scores)):
+                    st.markdown(f"**Document {i+1}** (Relevance: {score:.4f})")
+                    st.text(doc[:500] + "..." if len(doc) > 500 else doc)
+                    st.markdown("---")
+        except Exception as e:
+            st.error(f"An error occurred: {str(e)}")
+            logger.exception("Error during query processing")
+# Add a reset button
+if st.button("Reset and Upload New PDF"):
+    # Clean up temporary files
+    if st.session_state.pdf_path and os.path.exists(st.session_state.pdf_path):
+        os.unlink(st.session_state.pdf_path)
+    # Reset session state
+    st.session_state.pdf_processed = False
+    st.session_state.index = None
+    st.session_state.chunks = None
+    st.session_state.pdf_path = None
+    # Reload the page
+    st.experimental_rerun()
+# Footer
+st.markdown("---")
+st.markdown("Built with Streamlit, FAISS, and Ollama")

data.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from helper import extract_text_from_pdf, chunk_text, embedding_function, embedding_model, generate_hypothetical_answer, query_llm_with_context
+import numpy as np
+import faiss
+import pickle
+import os
+import logging
+from helper import query_llm_with_context
+logging.basicConfig(level=logging.INFO)
+# Path for storing the FAISS index and document chunks
+index_path = "./faiss_index"
+chunks_path = "./document_chunks.pkl"
+pdf_path = 'C:\Git Projects\AnnualReport_rag\IBM.pdf'
+print('Extracting text from pdf...')
+pdf_text = extract_text_from_pdf(pdf_path)
+print('Chunking pdf...')
+chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)
+print('Embedding chunks...')
+embeddings = embedding_function(chunks)
+print(f"Embeddings type: {type(embeddings)}")
+print(f"First embedding type: {type(embeddings[0])}")
+print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")
+# Convert embeddings to numpy array if they aren't already
+if not isinstance(embeddings, np.ndarray):
+    print("Converting embeddings to numpy array...")
+    embeddings = np.array(embeddings).astype('float32')
+# Get the dimension of the embeddings
+dimension = embeddings.shape[1]
+print(f"Embedding dimension: {dimension}")
+# Initialize FAISS index
+print('Initializing FAISS index...')
+index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
+# Add vectors to the index
+print('Adding vectors to FAISS index...')
+index.add(embeddings)
+# Save the index
+print('Saving FAISS index...')
+faiss.write_index(index, index_path)
+# Save the document chunks for retrieval
+print('Saving document chunks...')
+with open(chunks_path, 'wb') as f:
+    pickle.dump(chunks, f)
+print(f"Total vectors in index: {index.ntotal}")
+def retrieve_documents(query, n_results=5):
+    # Generate embedding for the query
+    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
+    # Search the index
+    distances, indices = index.search(query_embedding, n_results)
+    # Get the documents
+    documents = [chunks[i] for i in indices[0]]
+    # Convert distances to similarity scores (L2 distance: lower is better)
+    # Normalize distances to [0, 1] range where 1 is most similar
+    max_distance = np.max(distances)
+    similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
+    return documents, similarity_scores
+# Test the retrieval
+query="how has the profitability of the company been in last five years"
+print('Retrieving documents...')
+general_docs, general_scores = retrieve_documents(query, n_results=15)
+print(f"Number of docs returned for general query: {len(general_docs)}")
+# Print the results
+# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
+#     print(f"\nResult {i+1} (Score: {score:.4f}):")
+#     print(f"{doc[:200]}...")
+new_query=query+generate_hypothetical_answer(query)
+combined_context=retrieve_documents(new_query, n_results=15)
+answer = query_llm_with_context(query, combined_context, top_n=3)
+print('final_response:{answer}')

helper.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from sentence_transformers import SentenceTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from pypdf import PdfReader
+import requests
+import json
+def extract_text_from_pdf(pdf_path):
+    reader = PdfReader(pdf_path)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n"
+    return text.strip()
+def chunk_text(text, chunk_size=500, chunk_overlap=100):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,  # Overlap to preserve context
+        separators=["\n\n", "\n", " ", ""],  # Prioritize logical breaks
+    )
+    return splitter.split_text(text)
+embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+def embedding_function(texts):
+    return embedding_model.encode(texts, convert_to_numpy=True).tolist()
+def generate_hypothetical_answer(query):
+    import requests
+    import json
+    # Ollama API endpoint (default is localhost:11434)
+    ollama_url = "http://localhost:11434/api/generate"
+    # Prepare the prompt
+    prompt = f"Generate a plausible answer to the question:\n\n{query}\n\nAnswer:"
+    # Prepare the request payload
+    payload = {
+        "model": "llama2",  # or any other model you have pulled in Ollama
+        "prompt": prompt,
+        "stream": False
+    }
+    try:
+        # Make the API request to Ollama
+        response = requests.post(ollama_url, json=payload)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        # Parse the response
+        result = response.json()
+        # Extract the generated text
+        generated_text = result.get("response", "")
+        return generated_text.strip()
+    except Exception as e:
+        print(f"Error generating hypothetical answer: {e}")
+        return "Failed to generate a hypothetical answer."
+def query_llm_with_context(query,context,top_n=3):
+    # Get documents sorted by similarity
+    sorted_docs, sorted_scores = context
+    # Use only the top N documents
+    top_docs = sorted_docs[:top_n]
+    # Create a context string by joining the top documents
+    context = "\n\n===Document Boundary===\n\n".join(top_docs)
+    # Create a prompt with the context and query
+    prompt = f"""
+    Context information is below.
+    ---------------------
+    {context}
+    ---------------------
+    Given the context information and not prior knowledge, answer the following query:
+    Query: {query}
+    """
+    # Call Ollama API instead of OpenAI
+    ollama_url = "http://localhost:11434/api/generate"
+    # Prepare the request payload
+    payload = {
+        "model": "llama2",  # or any other model you have pulled in Ollama
+        "prompt": prompt,
+        "stream": False
+    }
+    try:
+        # Make the API request to Ollama
+        response = requests.post(ollama_url, json=payload)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+        # Parse the response
+        result = response.json()
+        # Extract the generated text
+        generated_text = result.get("response", "")
+        return generated_text.strip()
+    except Exception as e:
+        print(f"Error querying LLM with context: {e}")
+        return "Failed to generate an answer with the provided context."

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+sentence-transformers
+chromadb
+pypdf
+langchain
+openai
+faiss-cpu