"""
RAG Document Q&A Assistant
Upload documents, ask questions, get answers with source citations.
"""

import os
import tempfile

import chromadb
from pypdf import PdfReader  # pypdf (pure-Python PDF text extraction)
import gradio as gr
from chromadb.utils import embedding_functions
from openai import OpenAI
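# Assumed dependencies, inferred from the imports above (not pinned by this file):
# chromadb, pypdf, gradio, openai, and sentence-transformers (needed by the
# SentenceTransformerEmbeddingFunction below).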

# Initialize OpenAI client
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Initialize embedding function
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
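# all-MiniLM-L6-v2 is downloaded and run locally by sentence-transformers; it produces
# 384-dimensional embeddings, so retrieval needs no embedding API calls (or API key).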

# Global, module-level state for the current document (shared across sessions; fine for a single-user demo)
chroma_client = None
collection = None
current_chunks = []


def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from PDF using pypdf."""
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # guard against pages with no extractable text
    return text


def extract_text_from_txt(file_path: str) -> str:
    """Extract text from TXT file."""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def chunk_fixed_size(text: str, chunk_size: int = 500, overlap: int = 100) -> list[dict]:
    """Split text into fixed-size chunks with overlap."""
    chunks = []
    start = 0
    chunk_id = 0
    
    while start < len(text):
        end = start + chunk_size
        chunk_text = text[start:end].strip()
        
        if chunk_text:
            chunks.append({
                "id": f"chunk_{chunk_id}",
                "text": chunk_text,
                "start": start,
                "end": end
            })
            chunk_id += 1
        
        start = end - overlap  # advance, keeping the last `overlap` characters as shared context
    
    return chunks
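
# Illustrative walk-through (not from the source): with the defaults chunk_size=500 and
# overlap=100, a 1,200-character text yields windows [0:500], [400:900] and [800:1200],
# i.e. three chunks, each sharing 100 characters with its neighbour.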


def chunk_by_paragraph(text: str) -> list[dict]:
    """Split text by paragraphs (double newlines)."""
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    
    chunks = []
    for i, para in enumerate(paragraphs):
        if len(para) > 50:  # drop very short paragraphs (headings, stray lines)
            chunks.append({
                "id": f"chunk_{i}",
                "text": para,
                "start": 0,
                "end": 0
            })
    
    return chunks
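
# Illustrative example (assumed input, not from the source):
#   chunk_by_paragraph("Intro\n\nThis paragraph is comfortably longer than the fifty-character cut-off.")
# keeps only the second paragraph; very short ones such as "Intro" are dropped, and
# start/end stay 0 because character offsets are not tracked for this strategy.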


def process_document(file, chunking_strategy: str) -> str:
    """Process uploaded document and store in vector DB."""
    global chroma_client, collection, current_chunks
    
    if file is None:
        return "❌ Please upload a document first."
    
    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()
    
    try:
        if file_ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif file_ext in [".txt", ".md"]:
            text = extract_text_from_txt(file_path)
        else:
            return f"❌ Unsupported file type: {file_ext}. Please upload PDF or TXT."
    except Exception as e:
        return f"❌ Error reading file: {str(e)}"
    
    if not text.strip():
        return "❌ No text could be extracted from the document."
    
    if chunking_strategy == "Fixed-size (500 chars)":
        current_chunks = chunk_fixed_size(text, chunk_size=500, overlap=100)
    else:
        current_chunks = chunk_by_paragraph(text)
    
    if not current_chunks:
        return "❌ No chunks could be created from the document."
    
    # Initialize fresh Chroma client and collection
    chroma_client = chromadb.Client()
    try:
        chroma_client.delete_collection(name="documents")
    except Exception:
        # Nothing to delete if the collection does not exist yet.
        pass
    collection = chroma_client.create_collection(
        name="documents",
        embedding_function=embedding_func
    )
    
    collection.add(
        documents=[c["text"] for c in current_chunks],
        ids=[c["id"] for c in current_chunks]
    )
    
    return f"βœ… Document processed successfully!\n\nπŸ“Š **Stats:**\n- Characters: {len(text):,}\n- Chunks created: {len(current_chunks)}\n- Chunking strategy: {chunking_strategy}"


def retrieve_context(query: str, top_k: int = 3) -> list[dict]:
    """Retrieve relevant chunks for the query."""
    if collection is None:
        return []
    
    results = collection.query(
        query_texts=[query],
        n_results=top_k
    )
    
    retrieved = []
    for i, (doc, distance) in enumerate(zip(
        results["documents"][0],
        results["distances"][0]
    )):
        similarity = 1 / (1 + distance)  # heuristic: map Chroma's distance (lower = closer) to a 0-1 score
        retrieved.append({
            "text": doc,
            "similarity": similarity,
            "rank": i + 1
        })
    
    return retrieved
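
# Each retrieved item is a dict like {"text": "<chunk text>", "similarity": 0.43, "rank": 1};
# the similarity value is the heuristic 1 / (1 + distance) computed above (0.43 is illustrative).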


def generate_answer(query: str, context_docs: list[dict]) -> str:
    """Generate answer using OpenAI with retrieved context."""
    if not context_docs:
        return "I don't have any context to answer this question. Please upload a document first."
    
    context = "\n\n".join([
        f"[Source {doc['rank']}] (relevance: {doc['similarity']:.0%})\n{doc['text']}"
        for doc in context_docs
    ])
    
    prompt = f"""Answer the question based on the provided context. 
If the context doesn't contain enough information to answer fully, say so.
Always reference which source(s) you used.

CONTEXT:
{context}

QUESTION: {query}

ANSWER:"""

    try:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and cite your sources."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"


def ask_question(query: str) -> tuple[str, str]:
    """Main function to handle user questions."""
    if not query.strip():
        return "Please enter a question.", ""
    
    if collection is None:
        return "Please upload and process a document first.", ""
    
    retrieved = retrieve_context(query, top_k=3)
    answer = generate_answer(query, retrieved)
    
    sources = "\n\n---\n\n**πŸ“š Retrieved Sources:**\n\n"
    for doc in retrieved:
        sources += f"**[Source {doc['rank']}]** (relevance: {doc['similarity']:.0%})\n"
        sources += f"```\n{doc['text'][:300]}{'...' if len(doc['text']) > 300 else ''}\n```\n\n"
    
    return answer, sources


# Build Gradio interface
with gr.Blocks(title="RAG Document Q&A", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ“„ RAG Document Q&A Assistant
    
    Upload a document (PDF, TXT, or Markdown), choose a chunking strategy, and ask questions!
    
    **How it works:**
    1. Your document is split into chunks using the selected strategy
    2. Chunks are embedded using Sentence Transformers (all-MiniLM-L6-v2)
    3. When you ask a question, relevant chunks are retrieved using semantic search
    4. GPT-4o-mini generates an answer based on the retrieved context
    
    ---
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Step 1: Upload Document")
            file_input = gr.File(
                label="Upload PDF or TXT",
                file_types=[".pdf", ".txt", ".md"]
            )
            chunking_dropdown = gr.Dropdown(
                choices=["Fixed-size (500 chars)", "Paragraph-based"],
                value="Paragraph-based",
                label="Chunking Strategy"
            )
            process_btn = gr.Button("Process Document", variant="primary")
            process_output = gr.Markdown(label="Processing Status")
        
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Step 2: Ask Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this document about?",
                lines=2
            )
            ask_btn = gr.Button("Ask", variant="primary")
            
            answer_output = gr.Markdown(label="Answer")
            sources_output = gr.Markdown(label="Sources")
    
    gr.Markdown("""
    ---
    
    **πŸ“š References:**
    - [RAG Original Paper (Lewis et al., 2020)](https://arxiv.org/abs/2005.11401)
    - [RAG Survey (Gao et al., 2023)](https://arxiv.org/pdf/2312.10997)
    - [Chunking Strategies for RAG (Merola & Singh, 2025)](https://arxiv.org/abs/2504.19754)
    
    Built as part of an AI/ML Engineering portfolio project.
    """)
    
    process_btn.click(
        fn=process_document,
        inputs=[file_input, chunking_dropdown],
        outputs=[process_output]
    )
    
    ask_btn.click(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )
    
    question_input.submit(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output, sources_output]
    )


if __name__ == "__main__":
    demo.launch()