# Hugging Face Space: Rahaf2001/software-Documentation-RAG-System (status: Sleeping)
# File size: 7,873 Bytes

import gradio as gr
import requests
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Tuple
import re

# Sentence-embedding model used both to embed document chunks and to embed
# questions. It is instantiated once, when this module is imported.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Module-level retrieval state. process_documentation() assigns these and
# answer_question() reads them; they start out empty / None.
# Text chunks of the most recently processed document.
doc_chunks = []
# Result of model.encode() over doc_chunks; None until a document is processed.
doc_embeddings = None
# FAISS index built from doc_embeddings; None until a document is processed.
index = None
# URL the current chunks were fetched from.
source_url = ""

def fetch_documentation(url: str) -> str:
    """Download a page and return its visible text, one fragment per line.

    The response body is parsed with BeautifulSoup's ``html.parser``.
    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed before the text is extracted. Every line is then stripped, split
    on double spaces, and the non-empty fragments are joined with newlines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned text content of the page.

    Raises:
        Exception: With a user-facing message when the request fails. HTTP
            403, HTTP 404 and timeouts get dedicated messages; every other
            request failure is reported as ``"Error fetching URL: ..."``.
            The underlying ``requests`` exception is attached as the cause.
    """
    # Browser-like headers, presumably so that sites which reject default
    # client user agents still answer.
    # Accept-Encoding is deliberately NOT set here: advertising "br" by hand
    # lets a server reply with Brotli even when no Brotli decoder is
    # installed, which leaves the body undecoded. requests advertises only
    # the encodings it can actually decode.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Classify failures by exception type and status code rather than by
    # searching the error text: the text contains the URL, so a URL that
    # merely contains "403" or "404" would otherwise be misreported.
    try:
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        status_code = getattr(e.response, "status_code", None)
        if status_code == 403:
            raise Exception("Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)") from e
        if status_code == 404:
            raise Exception("Page not found (404). Please check the URL is correct.") from e
        raise Exception(f"Error fetching URL: {e}") from e
    except requests.exceptions.Timeout as e:
        raise Exception("Request timeout. The website took too long to respond.") from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error fetching URL: {e}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop elements that carry no documentation text (code, styling, chrome).
    for element in soup(["script", "style", "nav", "footer", "header"]):
        element.decompose()

    text = soup.get_text()

    # Normalise whitespace: strip each line, break on double spaces, and keep
    # only the non-empty fragments.
    lines = (line.strip() for line in text.splitlines())
    fragments = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(fragment for fragment in fragments if fragment)

def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to *overlap* trailing characters of *chunk*, starting on a word boundary."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    # If the cut landed in the middle of a word, drop that partial word so the
    # carried-over text starts cleanly.
    if not chunk[-overlap - 1].isspace() and not tail[0].isspace():
        pieces = tail.split(None, 1)
        tail = pieces[1] if len(pieces) > 1 else ""
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are detected as runs of text ending in ``.``, ``!`` or ``?``
    followed by whitespace, so dotted tokens such as ``3.10`` or
    ``docs.python.org`` stay intact and each sentence keeps its own
    punctuation. Sentences are packed greedily into a chunk until adding the
    next one would exceed *chunk_size*. Each following chunk starts with the
    tail of the previous one so that context spanning a boundary is
    retrievable from either side.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters. A single
            sentence longer than this is kept whole, and the carried-over
            tail may push a chunk past this size.
        overlap: Maximum number of trailing characters of a chunk repeated at
            the start of the next chunk (trimmed to a word boundary). Values
            are clamped to the range ``0 .. chunk_size - 1``.

    Returns:
        The list of chunks, in document order; empty if *text* has no
        non-whitespace content.
    """
    # An overlap as large as the chunk itself would make every chunk repeat
    # its whole predecessor, so keep it strictly smaller.
    overlap = max(0, min(overlap, chunk_size - 1))

    chunks: List[str] = []
    current = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue

        if not current:
            current = sentence
        elif len(current) + 1 + len(sentence) <= chunk_size:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            tail = _overlap_tail(current, overlap)
            current = f"{tail} {sentence}" if tail else sentence

    if current:
        chunks.append(current)

    return chunks

def process_documentation(url: str) -> str:
    """Fetch, chunk, embed and index the documentation at *url*.

    On success the module-level ``doc_chunks``, ``doc_embeddings``, ``index``
    and ``source_url`` are replaced together. On any failure they are left
    untouched, so a previously processed document stays fully queryable.

    Args:
        url: Address of the documentation page. Surrounding whitespace is
            ignored; an empty value is rejected.

    Returns:
        A human-readable status message: a success summary with chunk and
        character counts, a validation hint, or ``"Error: ..."`` describing
        the failure. This function does not raise.
    """
    global doc_chunks, doc_embeddings, index, source_url

    url = (url or "").strip()
    if not url:
        return "Please provide a URL"

    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)

        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."

        print("Chunking text...")
        new_chunks = chunk_text(text)

        if not new_chunks:
            return "No content chunks created. The documentation might be empty."

        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = model.encode(new_chunks, show_progress_bar=False)

        new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
        new_index.add(new_embeddings.astype('float32'))
    except Exception as e:
        return f"Error: {str(e)}"

    # Commit only after every step succeeded: publishing the chunks before the
    # index exists would pair new chunks with the previous document's index.
    doc_chunks, doc_embeddings, index, source_url = new_chunks, new_embeddings, new_index, url

    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(new_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"

def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks most similar to *question* and format them.

    The question is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index``. No text is generated; the
    "answer" is the best-matching chunk, followed by the second-best one when
    available.

    Args:
        question: The user's question. Empty or whitespace-only input is
            rejected.
        top_k: Number of chunks to retrieve. It is converted to ``int`` and
            clamped to the range ``1 .. len(doc_chunks)``.

    Returns:
        A ``(answer, sources)`` pair of display strings. ``sources`` lists
        every retrieved chunk with a score of ``1 / (1 + distance)``. On a
        validation problem or error, the first element carries the message
        and the second is empty. This function does not raise for failures
        during retrieval.
    """
    question = (question or "").strip()
    if not question:
        return "Please enter a question", ""

    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""

    try:
        # UI sliders may deliver a float, and FAISS needs an integer k. Asking
        # for more neighbours than there are vectors makes FAISS pad the
        # result with -1 labels, which would index doc_chunks[-1] by accident.
        k = max(1, min(int(top_k), len(doc_chunks)))

        question_embedding = model.encode([question])
        distances, indices = index.search(question_embedding.astype('float32'), k)

        hits = [
            (doc_chunks[i], dist)
            for i, dist in zip(indices[0], distances[0])
            if 0 <= i < len(doc_chunks)
        ]
        if not hits:
            return "No relevant content found in the processed documentation.", ""

        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"

        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )

        return answer, sources

    except Exception as e:
        return f"Error: {str(e)}", ""

with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    # Page heading and a short description of the workflow.
    gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")

    # Ingestion section: URL field, trigger button and read-only status panel.
    with gr.Row(), gr.Column():
        url_input = gr.Textbox(
            lines=1,
            placeholder="https://docs.python.org/3/tutorial/index.html",
            label="Documentation URL",
        )
        process_btn = gr.Button("Process Documentation", variant="primary")
        status_output = gr.Textbox(
            interactive=False,
            lines=6,
            label="Status",
        )

    gr.Markdown("---")

    # Query section: question field, retrieval depth and trigger button.
    with gr.Row(), gr.Column():
        question_input = gr.Textbox(
            lines=3,
            placeholder="What is this documentation about?",
            label="Your Question",
        )

        top_k_slider = gr.Slider(
            label="Number of chunks to retrieve",
            minimum=1,
            maximum=5,
            step=1,
            value=3,
        )

        ask_btn = gr.Button("Ask Question", variant="primary")

    # Results section: answer on the left, retrieved chunks on the right.
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(
                interactive=False,
                lines=10,
                label="Answer",
            )

        with gr.Column():
            sources_output = gr.Textbox(
                interactive=False,
                lines=10,
                label="Source Chunks",
            )

    # Clickable sample URLs that fill the URL field.
    gr.Markdown("### Example URLs to try:")
    example_urls = [
        "https://raw.githubusercontent.com/python/cpython/main/README.rst",
        "https://docs.python.org/3/tutorial/introduction.html",
        "https://raw.githubusercontent.com/huggingface/transformers/main/README.md",
        "https://pytorch.org/docs/stable/torch.html",
    ]
    gr.Examples(
        examples=[[link] for link in example_urls],
        inputs=url_input,
    )

    # Event wiring.
    process_btn.click(
        fn=process_documentation,
        inputs=[url_input],
        outputs=[status_output],
    )

    # Clicking the button and pressing Enter in the question box run the same
    # query, registered in that order.
    for register in (ask_btn.click, question_input.submit):
        register(
            fn=answer_question,
            inputs=[question_input, top_k_slider],
            outputs=[answer_output, sources_output],
        )

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()