Spaces:

Rahaf2001
/

software-Documentation-RAG-System

Sleeping

App Files Files Community

Rahaf2001 committed on Oct 22, 2025

Commit

6d0080e

verified ·

1 Parent(s): 29879ca

Create app.py

Browse files

Files changed (1) hide show

app.py +215 -0

app.py ADDED Viewed

	@@ -0,0 +1,215 @@

+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from typing import List, Tuple
+import re
# Sentence-embedding model, instantiated once when the module is imported.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Retrieval state for the most recently processed page. All four names are
# rebound by process_documentation() and read by answer_question().
doc_chunks = []              # text chunks, positionally aligned with the index
doc_embeddings = index = None  # embedding matrix and FAISS index (unset until a page is processed)
source_url = ""              # URL the current chunks were fetched from
def fetch_documentation(url: str) -> str:
    """Download a web page and return its visible text, one fragment per line.

    The page is requested with a browser-like ``User-Agent`` header and
    ``timeout=10``. ``script``, ``style``, ``nav``, ``footer`` and ``header``
    elements are removed before text extraction. The extracted text is then
    split on line breaks and on double spaces, each fragment is stripped, and
    the non-empty fragments are joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned text of the page.

    Raises:
        Exception: If anything goes wrong while requesting or parsing the
            page (including an HTTP error status). The message starts with
            ``"Error fetching URL: "`` and the original exception is attached
            as ``__cause__`` so the root cause stays visible in tracebacks.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove non-content elements so they do not end up in the text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        stripped_lines = (line.strip() for line in soup.get_text().splitlines())
        # Runs of two spaces are treated as fragment separators as well.
        fragments = (
            part.strip() for line in stripped_lines for part in line.split("  ")
        )
        return '\n'.join(fragment for fragment in fragments if fragment)
    except Exception as e:
        # Keep the generic exception type callers already handle, but chain
        # the original error instead of discarding it.
        raise Exception(f"Error fetching URL: {e}") from e
def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return the trailing whole words of *chunk* that fit in *overlap* characters."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    # If the cut landed in the middle of a word, drop that partial word.
    if not chunk[-overlap - 1].isspace() and not tail[0].isspace():
        pieces = tail.split(None, 1)
        tail = pieces[1] if len(pieces) > 1 else ""
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are detected at ``.``, ``!`` or ``?`` followed by whitespace, so
    punctuation inside tokens such as ``3.11`` or ``os.path`` does not split
    the text, and each sentence keeps its own terminator. Sentences are
    accumulated, separated by single spaces, until adding the next one would
    exceed *chunk_size*; the chunk is then emitted and a new one is started.
    Each new chunk begins with the trailing words of the previous chunk (at
    most *overlap* characters) so context is shared across chunk boundaries.

    Limitations: abbreviations followed by a space (``e.g. foo``) are still
    treated as sentence ends, and a single sentence longer than *chunk_size*
    is kept whole rather than being cut.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound on chunk length in characters.
        overlap: Maximum number of characters repeated from the end of the
            previous chunk. Negative values are treated as 0 and the value is
            capped at half of *chunk_size*.

    Returns:
        The chunks in document order; an empty list when *text* contains no
        non-whitespace content.
    """
    overlap = max(0, min(overlap, chunk_size // 2))
    chunks: List[str] = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if not current:
            current = sentence
        elif len(current) + 1 + len(sentence) <= chunk_size:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            carried = _overlap_tail(current, overlap)
            current = f"{carried} {sentence}" if carried else sentence
    if current:
        chunks.append(current)
    return chunks
def process_documentation(url: str) -> str:
    """Fetch a page, chunk and embed it, and install it as the active index.

    The page text is fetched with ``fetch_documentation``, split with
    ``chunk_text``, embedded with the module-level ``model`` and added to a
    new ``faiss.IndexFlatL2``. The module globals ``doc_chunks``,
    ``doc_embeddings``, ``index`` and ``source_url`` are rebound together and
    only after every step has succeeded, so a failure part-way through leaves
    the previously processed page untouched and consistent.

    Args:
        url: Address of the documentation page. Surrounding whitespace is
            ignored.

    Returns:
        A human-readable status message: a success summary with chunk and
        character counts, a validation hint, or ``"Error: ..."`` when any
        step raised.
    """
    global doc_chunks, doc_embeddings, index, source_url
    url = (url or "").strip()
    if not url:
        return "Please provide a URL"
    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)
        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."
        print("Chunking text...")
        new_chunks = chunk_text(text)
        if not new_chunks:
            return "No content chunks created. The documentation might be empty."
        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = model.encode(new_chunks, show_progress_bar=False).astype('float32')
        new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
        new_index.add(new_embeddings)
    except Exception as e:
        return f"Error: {str(e)}"
    # Publish the new state in one step so chunks and index never disagree.
    doc_chunks, doc_embeddings, index, source_url = (
        new_chunks, new_embeddings, new_index, url
    )
    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(doc_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"
def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks closest to *question* and format them as an answer.

    The question is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index``. The best chunk is shown as
    "Relevant Information", the second (if any) as "Additional Context", and
    every retrieved chunk is listed in the sources text with a score of
    ``1 / (1 + distance)``.

    Args:
        question: The user's question; empty or whitespace-only input is
            rejected.
        top_k: Number of chunks to retrieve. Coerced to ``int`` and limited
            to the range ``1 .. len(doc_chunks)``, because FAISS pads results
            with index ``-1`` when more neighbours are requested than the
            index holds, and ``-1`` would otherwise silently select the last
            chunk.

    Returns:
        ``(answer, sources)``. On validation failure or error the first
        element is the message and the second is an empty string.
    """
    if not question or not question.strip():
        return "Please enter a question", ""
    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""
    try:
        k = max(1, min(int(top_k), len(doc_chunks)))
        query = model.encode([question]).astype('float32')
        distances, indices = index.search(query, k)
        # Keep only positions that refer to a real chunk (FAISS uses -1 as filler).
        hits = [
            (doc_chunks[i], dist)
            for i, dist in zip(indices[0], distances[0])
            if 0 <= i < len(doc_chunks)
        ]
        if not hits:
            return "No relevant content found in the processed documentation.", ""
        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"
        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"
        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )
        return answer, sources
    except Exception as e:
        return f"Error: {str(e)}", ""
# Gradio interface: a URL-processing section on top, a question section below,
# and side-by-side answer / source panels.
with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")

    # --- Step 1: choose and process a documentation page -------------------
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Documentation URL", placeholder="https://docs.python.org/3/tutorial/index.html", lines=1)
            process_btn = gr.Button("Process Documentation", variant="primary")
            status_output = gr.Textbox(label="Status", lines=6, interactive=False)

    gr.Markdown("---")

    # --- Step 2: ask a question about the processed page -------------------
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(label="Your Question", placeholder="What is this documentation about?", lines=3)
            top_k_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of chunks to retrieve")
            ask_btn = gr.Button("Ask Question", variant="primary")

    # --- Results ------------------------------------------------------------
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(label="Answer", lines=10, interactive=False)
        with gr.Column():
            sources_output = gr.Textbox(label="Source Chunks", lines=10, interactive=False)

    gr.Markdown("### Example URLs to try:")
    gr.Examples(
        examples=[
            ["https://docs.python.org/3/tutorial/introduction.html"],
            ["https://pytorch.org/docs/stable/torch.html"],
            ["https://huggingface.co/docs/transformers/quicktour"],
        ],
        inputs=url_input,
    )

    # --- Event wiring -------------------------------------------------------
    process_btn.click(fn=process_documentation, inputs=[url_input], outputs=[status_output])
    # Clicking the button and submitting the question box run the same query.
    for register in (ask_btn.click, question_input.submit):
        register(
            fn=answer_question,
            inputs=[question_input, top_k_slider],
            outputs=[answer_output, sources_output],
        )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()