viraj committed
Commit · 52e9bab
Parent(s): aa870e0

enhancements

Files changed:
- .gitignore +3 -1
- app.py +50 -14
- rag_pipeline.py +88 -19
.gitignore CHANGED
@@ -1,2 +1,4 @@
 .env
-__pycache__
+__pycache__
+chroma_db
+files
app.py CHANGED
@@ -61,20 +61,56 @@ async def query_endpoint(request = Body(...)):
         raise HTTPException(status_code=422, detail="Missing file_id or question")
 
     retriever_path = f"{CHROMA_DIR}/{file_id}"
-    # Load retriever from disk
     if not os.path.exists(retriever_path):
-    return {"answer": answer}
+        raise HTTPException(status_code=404, detail="Vectorstore for this file_id not found.")
+
+    try:
+        # Initialize vectorstore with metadata filtering
+        vectorstore = Chroma(
+            embedding_function=embedding_model,
+            persist_directory=retriever_path
+        )
+
+        # Configure retriever with MMR search
+        retriever = vectorstore.as_retriever(
+            search_type="mmr",
+            search_kwargs={
+                "k": 4,
+                "fetch_k": 8,
+                "lambda_mult": 0.7,
+            }
+        )
+
+        # First, get context around selected text if it exists
+        contexts = []
+        if selected_text:
+            selected_results = retriever.invoke(selected_text)
+            contexts.extend([doc.page_content for doc in selected_results])
+
+        # Then get context for the question
+        question_results = retriever.invoke(question)
+        contexts.extend([doc.page_content for doc in question_results])
+
+        # Remove duplicates while preserving order
+        contexts = list(dict.fromkeys(contexts))
+
+        # Format the context with clear section separation
+        formatted_context = ""
+        if selected_text:
+            formatted_context += f"Selected Text Context:\n{selected_text}\n\n"
+
+        formatted_context += "Related Document Contexts:\n" + "\n---\n".join(
+            re.sub(r"\s+", " ", context.strip())
+            for context in contexts
+        )
+
+        # Get the answer using the enhanced context
+        answer = answer_query(question, formatted_context, explain_like_5)
+
+        return {
+            "answer": answer,
+            "context_used": formatted_context  # Optionally return context for debugging
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
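For context, here is a minimal sketch of how a client might call this endpoint. It assumes the handler above is mounted at a /query route on localhost:8000 (neither appears in this hunk) and that file_id, question, selected_text, and explain_like_5 are parsed from the JSON body earlier in the handler; the sample values are hypothetical.

    import requests

    # Hypothetical route and payload; field names are inferred from the handler body above.
    resp = requests.post(
        "http://localhost:8000/query",
        json={
            "file_id": "abc123",  # id assigned when the file was processed
            "question": "What does the document conclude?",
            "selected_text": None,  # optional: a highlighted passage to anchor retrieval
            "explain_like_5": False,
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json()["answer"])

A 404 here means no persisted vectorstore exists under CHROMA_DIR for that file_id; a 422 means the required fields were missing from the request body.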
rag_pipeline.py CHANGED
@@ -1,6 +1,6 @@
 import tempfile
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import os
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -19,36 +19,105 @@ def process_file(file_bytes, filename, file_id):
         tmp.write(file_bytes)
         tmp_path = tmp.name
 
+    print("Processing file:", filename)
+    if ext == 'pdf':
+        loader = PyPDFLoader(tmp_path)
+    elif ext == 'txt':
+        loader = TextLoader(tmp_path, encoding='utf-8')
+    else:
+        os.unlink(tmp_path)
+        raise ValueError(f"Unsupported file type: .{ext}")
 
+    docs = loader.load()
+
+    # Enhanced text splitting strategy
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,  # Smaller chunks for more precise retrieval
+        chunk_overlap=50,  # Reduced overlap but still maintaining context
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""],
+        add_start_index=True  # This helps maintain position information
+    )
+
     chunks = text_splitter.split_documents(docs)
+
+    # Enhance metadata for each chunk
+    for i, chunk in enumerate(chunks):
+        chunk.metadata.update({
+            "chunk_id": i,
+            "file_id": file_id,
+            "filename": filename,
+            "source": filename,
+            "chunk_type": "document"
+        })
 
+    # Create Chroma collection with enhanced metadata and filtering
     vectorstore = Chroma.from_documents(
         documents=chunks,
         embedding=embedding_model,
-        persist_directory=f"{CHROMA_DIR}/{file_id}"
+        persist_directory=f"{CHROMA_DIR}/{file_id}",
+        collection_metadata={
+            "file_id": file_id,
+            "filename": filename,
+            "hnsw_space": "cosine",  # Explicitly set distance metric
+            "document_type": ext
+        }
     )
+
+    # Configure retriever with metadata filtering capability
+    retriever = vectorstore.as_retriever(
+        search_type="mmr",  # Use MMR for diversity in results
+        search_kwargs={
+            "k": 4,
+            "fetch_k": 8,  # Fetch more candidates for MMR
+            "lambda_mult": 0.7,  # Balance between relevance and diversity
+        }
+    )
+
     os.unlink(tmp_path)
     return retriever
 
 
 def answer_query(question, context, explain_like_5=False):
+    # Validate inputs
+    if not question or not context:
+        raise ValueError("Question and context must not be empty")
+
+    if not isinstance(context, (str, list)):
+        raise TypeError("Context must be a string or list")
+
+    # Format context if it's a list
+    if isinstance(context, list):
+        context = "\n\n".join(str(c) for c in context)
+
     system_prompt = (
-        "You are a helpful assistant answering user queries based on provided document chunks.\n"
+        "You are a helpful assistant answering user queries based STRICTLY on the provided document chunks.\n"
+        "IMPORTANT RULES:\n"
+        "1. ONLY use information from the given context. Do not use any external knowledge.\n"
+        "2. If the answer cannot be fully derived from the context, say 'I cannot answer this question based on the provided context.'\n"
+        "3. If you're unsure about any part of the answer, acknowledge the uncertainty.\n"
+        "4. Do not make assumptions beyond what's explicitly stated in the context.\n"
+        "5. Quote relevant parts of the context to support your answers when possible."
     )
+
     if explain_like_5:
-        system_prompt += "\nExplain the answer in a simple way, like you're talking to a 5-year-old."
+        system_prompt += "\nExplain the answer in a simple way, like you're talking to a 5-year-old, but still only use information from the context."
+    print("Context:", context)
+    try:
+        # Send to LLM with formatted prompt
+        response = client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": (
+                    f"Context:\n{context}\n\n"
+                    f"Question: {question}\n\n"
+                    "Remember to answer ONLY based on the information provided in the context above. "
+                    "If you cannot find the answer in the context, say so explicitly."
+                )}
+            ],
+            temperature=0.3  # Lower temperature for more focused answers
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        raise Exception(f"Error generating answer: {str(e)}")
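Taken together, the two functions above compose as in the sketch below. It assumes CHROMA_DIR, embedding_model, and the LLM client are defined at module level in rag_pipeline.py (they are referenced but not shown in this diff), that ext is derived from filename earlier in process_file, and that the file name and file_id are hypothetical.

    from rag_pipeline import process_file, answer_query

    with open("sample.pdf", "rb") as f:
        file_bytes = f.read()

    # Chunk, embed, and persist the document; an MMR-configured retriever comes back.
    retriever = process_file(file_bytes, "sample.pdf", file_id="abc123")

    # Retrieve supporting chunks for a question, then answer strictly from them.
    question = "What does the document conclude?"
    docs = retriever.invoke(question)
    context = [doc.page_content for doc in docs]

    print(answer_query(question, context, explain_like_5=False))

Note that answer_query accepts either a single string or a list of chunk strings; the list form is joined with blank lines before being placed in the prompt.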