Spaces:

Nav772
/

rag-qa-document

Sleeping

App Files Files Community

Navneet Sai committed on Feb 14

Commit

1bfb382

1 Parent(s): de95ad8

Initial RAG App

Browse files

Files changed (3) hide show

README.md +32 -7
app.py +291 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,13 +1,38 @@
 ---
-title: Rag Qa Document
-emoji: 🏢
-colorFrom: purple
-colorTo: blue
 sdk: gradio
-sdk_version: 6.5.1
 app_file: app.py
 pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: RAG Document Q&A Assistant
+emoji: 📄
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 ---
+# RAG Document Q&A Assistant
+Upload a PDF, TXT, or Markdown document and ask questions about its content.
+## How It Works
+1. **Document Processing**: Your document is split into chunks using the selected strategy (fixed-size or paragraph-based)
+2. **Embedding**: Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
+3. **Retrieval**: When you ask a question, relevant chunks are retrieved using semantic search via ChromaDB
+4. **Generation**: GPT-4o-mini generates an answer based on the retrieved context
+## Features
+- PDF, TXT, and Markdown (.md) file support
+- Two chunking strategies for comparison
+- Source citations with relevance scores
+- Built with Gradio, ChromaDB, and OpenAI API
+## References
+- [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
+- [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
+- [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)
+## Author
+Built as part of an AI/ML Engineering portfolio project.

app.py ADDED Viewed

	@@ -0,0 +1,291 @@

+"""
+RAG Document Q&A Assistant
+Upload documents, ask questions, get answers with source citations.
+"""
+import os
+import tempfile
+from typing import Optional
+import chromadb
+import fitz  # PyMuPDF
+import gradio as gr
+from chromadb.utils import embedding_functions
+from openai import OpenAI
+# Initialize OpenAI client
+# The API key is read from the OPENAI_API_KEY environment variable when this
+# module is imported; generate_answer() is the only user of this client.
+openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+# Initialize embedding function
+# Passed to create_collection() in process_document(), so Chroma embeds both
+# the stored chunks and the query texts with this Sentence Transformers model.
+embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name="all-MiniLM-L6-v2"
+)
+# Global state for the current session
+# process_document() rebinds all three names; retrieve_context() and
+# ask_question() read `collection` and treat None as "no document processed yet".
+# NOTE(review): these are module globals, so they are shared by every caller in
+# this process rather than kept per browser session - confirm that is intended.
+chroma_client = None
+collection = None
+current_chunks = []
+def extract_text_from_pdf(file_path: str) -> str:
+    """Extract the plain text of every page of a PDF using PyMuPDF.
+    Args:
+        file_path: Path of the PDF file to open.
+    Returns:
+        The text of all pages, concatenated in page order.
+    Raises:
+        Whatever PyMuPDF raises when the file cannot be opened or parsed; the
+        caller (process_document) turns that into a user-facing message.
+    """
+    # The context manager closes the document even when text extraction raises
+    # part-way through, so the file handle is never leaked.
+    with fitz.open(file_path) as doc:
+        # join() avoids rebuilding an ever-growing string once per page.
+        return "".join(page.get_text() for page in doc)
+def extract_text_from_txt(file_path: str) -> str:
+    """Read a text file and return its entire contents as one string.
+    The file is decoded as UTF-8; byte sequences that are not valid UTF-8 are
+    dropped (errors="ignore") instead of raising.
+    """
+    with open(file_path, "r", encoding="utf-8", errors="ignore") as source:
+        contents = source.read()
+    return contents
+def chunk_fixed_size(text: str, chunk_size: int = 500, overlap: int = 100) -> list[dict]:
+    """Split text into fixed-size character windows that overlap.
+    Args:
+        text: The full document text.
+        chunk_size: Maximum number of characters per window; must be positive.
+        overlap: Number of characters shared by consecutive windows; must be
+            smaller than chunk_size, otherwise the window would never advance.
+    Returns:
+        A list of dicts with keys "id" ("chunk_0", "chunk_1", ...), "text" (the
+        window with surrounding whitespace stripped), and "start"/"end" (the
+        window's character offsets in `text`, with "end" never past len(text)).
+        Windows that are empty after stripping are skipped.
+    Raises:
+        ValueError: If chunk_size is not positive or overlap >= chunk_size.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if overlap >= chunk_size:
+        raise ValueError("overlap must be smaller than chunk_size")
+    text_len = len(text)
+    step = chunk_size - overlap
+    chunks = []
+    for start in range(0, text_len, step):
+        end = min(start + chunk_size, text_len)
+        chunk_text = text[start:end].strip()
+        if chunk_text:
+            chunks.append({
+                "id": f"chunk_{len(chunks)}",
+                "text": chunk_text,
+                "start": start,
+                "end": end
+            })
+        # Once a window reaches the end of the text, any further window would
+        # only repeat the tail of this one, so stop here.
+        if end == text_len:
+            break
+    return chunks
+def chunk_by_paragraph(text: str, min_length: int = 50) -> list[dict]:
+    """Split text into one chunk per paragraph (pieces separated by "\\n\\n").
+    Args:
+        text: The full document text.
+        min_length: Paragraphs whose stripped length is not greater than this
+            are left out (short fragments such as headings).
+    Returns:
+        A list of dicts with keys "id", "text" (the stripped paragraph), and
+        "start"/"end" (character offsets such that text[start:end] == "text").
+        The number in "id" is the paragraph's position among all non-empty
+        paragraphs, so ids have gaps where short paragraphs were left out.
+    """
+    chunks = []
+    cursor = 0  # offset in `text` at which the current raw piece begins
+    index = 0  # position among non-empty paragraphs, kept or not
+    for raw in text.split("\n\n"):
+        para = raw.strip()
+        if para:
+            if len(para) > min_length:
+                # Skip the whitespace that strip() removed from the front.
+                start = cursor + (len(raw) - len(raw.lstrip()))
+                chunks.append({
+                    "id": f"chunk_{index}",
+                    "text": para,
+                    "start": start,
+                    "end": start + len(para)
+                })
+            index += 1
+        cursor += len(raw) + 2  # step over the piece and its "\n\n" separator
+    return chunks
+def process_document(file, chunking_strategy: str) -> str:
+    """Read an uploaded document, chunk it, and index the chunks in Chroma.
+    Args:
+        file: The uploaded file, either an object exposing the path as `.name`
+            or a plain path string; None when nothing was uploaded.
+        chunking_strategy: "Fixed-size (500 chars)" selects chunk_fixed_size;
+            any other value selects chunk_by_paragraph.
+    Returns:
+        A Markdown status message: a success summary with statistics, or a
+        message starting with "❌" that describes what went wrong.
+    Side effects:
+        On success rebinds the module globals chroma_client, collection and
+        current_chunks. If the document cannot be read or chunked they are left
+        untouched; if indexing fails they are reset to their empty state.
+    """
+    global chroma_client, collection, current_chunks
+    if file is None:
+        return "❌ Please upload a document first."
+    # Accept both an upload object carrying `.name` and a bare path string.
+    file_path = getattr(file, "name", file)
+    file_ext = os.path.splitext(file_path)[1].lower()
+    try:
+        if file_ext == ".pdf":
+            text = extract_text_from_pdf(file_path)
+        elif file_ext in (".txt", ".md"):
+            text = extract_text_from_txt(file_path)
+        else:
+            return f"❌ Unsupported file type: {file_ext}. Please upload PDF or TXT."
+    except Exception as e:
+        return f"❌ Error reading file: {str(e)}"
+    if not text.strip():
+        return "❌ No text could be extracted from the document."
+    # Work on locals so a failure below cannot leave the globals half-updated.
+    if chunking_strategy == "Fixed-size (500 chars)":
+        new_chunks = chunk_fixed_size(text, chunk_size=500, overlap=100)
+    else:
+        new_chunks = chunk_by_paragraph(text)
+    if not new_chunks:
+        return "❌ No chunks could be created from the document."
+    try:
+        new_client = chromadb.Client()
+        try:
+            new_client.delete_collection(name="documents")
+        except Exception:
+            # Best effort: there is nothing to delete on the first run.
+            # NOTE(review): the exception type raised for a missing collection
+            # appears to vary between chromadb releases - narrow this once the
+            # dependency is pinned.
+            pass
+        new_collection = new_client.create_collection(
+            name="documents",
+            embedding_function=embedding_func
+        )
+        new_collection.add(
+            documents=[c["text"] for c in new_chunks],
+            ids=[c["id"] for c in new_chunks]
+        )
+    except Exception as e:
+        # The previous collection may already have been deleted above, so the
+        # old handle cannot be trusted; fall back to "no document loaded".
+        chroma_client, collection, current_chunks = None, None, []
+        return f"❌ Error indexing document: {str(e)}"
+    chroma_client, collection, current_chunks = new_client, new_collection, new_chunks
+    return f"✅ Document processed successfully!\n\n📊 **Stats:**\n- Characters: {len(text):,}\n- Chunks created: {len(current_chunks)}\n- Chunking strategy: {chunking_strategy}"
+def retrieve_context(query: str, top_k: int = 3) -> list[dict]:
+    """Retrieve the stored chunks that are closest to the query.
+    Args:
+        query: The user's question; Chroma embeds it with the collection's
+            embedding function.
+        top_k: Maximum number of chunks to return.
+    Returns:
+        A list of dicts in the order Chroma returns them, each with "text"
+        (the chunk), "similarity" (1 / (1 + distance), so a smaller distance
+        gives a value closer to 1) and "rank" (1-based position). Empty when no
+        document has been processed or nothing can be requested.
+    """
+    if collection is None:
+        return []
+    # A short document can hold fewer chunks than top_k; never ask Chroma for
+    # more results than the collection contains.
+    n_results = min(top_k, collection.count())
+    if n_results <= 0:
+        return []
+    results = collection.query(
+        query_texts=[query],
+        n_results=n_results
+    )
+    # Index [0] selects the result lists that belong to the single query text.
+    matches = zip(results["documents"][0], results["distances"][0])
+    return [
+        {
+            "text": doc,
+            "similarity": 1 / (1 + distance),
+            "rank": rank
+        }
+        for rank, (doc, distance) in enumerate(matches, start=1)
+    ]
+def generate_answer(query: str, context_docs: list[dict]) -> str:
+    """Ask the OpenAI chat model to answer the query from the retrieved chunks.
+    Args:
+        query: The user's question.
+        context_docs: Retrieved chunks, each a dict with "text", "similarity"
+            and "rank" as produced by retrieve_context.
+    Returns:
+        The model's answer, or a message explaining why no answer is available
+        (no context, an API error, or an empty completion). Never None.
+    """
+    if not context_docs:
+        return "I don't have any context to answer this question. Please upload a document first."
+    # Label every chunk so the model can cite it as "[Source N]".
+    context = "\n\n".join(
+        f"[Source {doc['rank']}] (relevance: {doc['similarity']:.0%})\n{doc['text']}"
+        for doc in context_docs
+    )
+    prompt = f"""Answer the question based on the provided context.
+If the context doesn't contain enough information to answer fully, say so.
+Always reference which source(s) you used.
+CONTEXT:
+{context}
+QUESTION: {query}
+ANSWER:"""
+    try:
+        response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and cite your sources."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.3,
+            max_tokens=500
+        )
+        answer = response.choices[0].message.content
+    except Exception as e:
+        # UI boundary: report the failure as text instead of crashing the app.
+        return f"❌ Error generating answer: {str(e)}"
+    # The SDK types message content as optional; keep the declared str return.
+    if not answer:
+        return "❌ The model returned an empty response. Please try again."
+    return answer
+def ask_question(query: str) -> tuple[str, str]:
+    """Answer a user question and list the chunks the answer was based on.
+    Returns a pair (answer, sources): the first element is the generated
+    answer or a prompt telling the user what to do first; the second is a
+    Markdown listing of the retrieved chunks (each cut to 300 characters), or
+    an empty string when the question was not processed.
+    """
+    if query.strip() == "":
+        return "Please enter a question.", ""
+    if collection is None:
+        return "Please upload and process a document first.", ""
+    hits = retrieve_context(query, top_k=3)
+    answer = generate_answer(query, hits)
+    entries = []
+    for hit in hits:
+        body = hit['text']
+        preview = body[:300]
+        suffix = '...' if len(body) > 300 else ''
+        heading = f"**[Source {hit['rank']}]** (relevance: {hit['similarity']:.0%})\n"
+        entries.append(heading + f"```\n{preview}{suffix}\n```\n\n")
+    sources = "\n\n---\n\n**📚 Retrieved Sources:**\n\n" + "".join(entries)
+    return answer, sources
+# Build Gradio interface
+# Layout: an introductory Markdown header, then one row with two columns
+# (upload/processing controls on the left, question/answer widgets on the
+# right), then a references footer. Event handlers are attached at the end.
+with gr.Blocks(title="RAG Document Q&A", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 📄 RAG Document Q&A Assistant
+    Upload a document (PDF or TXT), choose a chunking strategy, and ask questions!
+    **How it works:**
+    1. Your document is split into chunks using the selected strategy
+    2. Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
+    3. When you ask a question, relevant chunks are retrieved using semantic search
+    4. GPT-4o-mini generates an answer based on the retrieved context
+    ---
+    """)
+    with gr.Row():
+        # Left column (scale=1): pick a file and a chunking strategy, then index.
+        with gr.Column(scale=1):
+            gr.Markdown("### 📤 Step 1: Upload Document")
+            # The extensions offered here match the ones process_document accepts.
+            file_input = gr.File(
+                label="Upload PDF or TXT",
+                file_types=[".pdf", ".txt", ".md"]
+            )
+            # process_document compares against the exact text
+            # "Fixed-size (500 chars)"; any other choice means paragraph chunking.
+            chunking_dropdown = gr.Dropdown(
+                choices=["Fixed-size (500 chars)", "Paragraph-based"],
+                value="Paragraph-based",
+                label="Chunking Strategy"
+            )
+            process_btn = gr.Button("Process Document", variant="primary")
+            process_output = gr.Markdown(label="Processing Status")
+        # Right column (scale=2): ask a question, show the answer and sources.
+        with gr.Column(scale=2):
+            gr.Markdown("### 💬 Step 2: Ask Questions")
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="What is this document about?",
+                lines=2
+            )
+            ask_btn = gr.Button("Ask", variant="primary")
+            answer_output = gr.Markdown(label="Answer")
+            sources_output = gr.Markdown(label="Sources")
+    gr.Markdown("""
+    ---
+    **📚 References:**
+    - [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
+    - [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
+    - [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)
+    Built as part of an AI/ML Engineering portfolio project.
+    """)
+    # "Process Document" runs process_document on the uploaded file and the
+    # selected strategy; its status message is rendered in process_output.
+    process_btn.click(
+        fn=process_document,
+        inputs=[file_input, chunking_dropdown],
+        outputs=[process_output]
+    )
+    # The "Ask" button and the textbox's submit event call the same handler,
+    # whose two return values fill the answer and sources panels in that order.
+    ask_btn.click(
+        fn=ask_question,
+        inputs=[question_input],
+        outputs=[answer_output, sources_output]
+    )
+    question_input.submit(
+        fn=ask_question,
+        inputs=[question_input],
+        outputs=[answer_output, sources_output]
+    )
+# Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=4.0.0
+chromadb>=0.4.0
+sentence-transformers>=2.2.0
+openai>=1.0.0
+pymupdf>=1.23.0