import re
from typing import List, Tuple

import faiss
import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for both indexing and querying.
# It is instantiated once, when the module is imported.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Module-level state describing the currently processed documentation page.
# process_documentation() rebinds these names; answer_question() reads them.
doc_chunks = []        # text chunks, in the order they were embedded
doc_embeddings = None  # value returned by model.encode() for doc_chunks
index = None           # FAISS index built over doc_embeddings
source_url = ""        # URL the current chunks were fetched from
def fetch_documentation(url: str) -> str:
    """Download *url* and return its visible text.

    The page is requested with browser-like headers (15 second timeout,
    redirects followed) and parsed with BeautifulSoup. ``script``, ``style``,
    ``nav``, ``footer`` and ``header`` elements are removed before the text is
    extracted. Each line of the extracted text is trimmed, split wherever two
    or more consecutive spaces occur, and the non-empty pieces are joined with
    newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned page text, one non-empty phrase per line.

    Raises:
        Exception: With a user-facing message for HTTP 403, HTTP 404,
            timeouts, or any other failure while fetching or parsing. The
            original error is attached as ``__cause__``.
    """
    # Accept-Encoding is deliberately left to requests: it advertises only the
    # encodings it can decode. Forcing "br" here can produce undecoded Brotli
    # bytes when no Brotli decoder is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    try:
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that carry no documentation prose.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = soup.get_text()

        lines = (line.strip() for line in text.splitlines())
        # Split only on runs of 2+ spaces (layout gaps). Splitting on a single
        # space would put every word on its own line.
        phrases = (
            phrase.strip()
            for line in lines
            for phrase in re.split(r" {2,}", line)
        )
        return '\n'.join(phrase for phrase in phrases if phrase)
    except requests.exceptions.HTTPError as e:
        # Classify by status code, not by searching the message: the message
        # embeds the URL, which may itself contain "403" or "404".
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 403:
            raise Exception("Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)") from e
        if status_code == 404:
            raise Exception("Page not found (404). Please check the URL is correct.") from e
        raise Exception(f"Error fetching URL: {e}") from e
    except requests.exceptions.Timeout as e:
        raise Exception("Request timeout. The website took too long to respond.") from e
    except Exception as e:
        raise Exception(f"Error fetching URL: {e}") from e
def _split_oversized(segment: str, limit: int) -> List[str]:
    """Break *segment* into pieces of at most *limit* characters, preferring whitespace cuts."""
    pieces: List[str] = []
    while len(segment) > limit:
        # Right-most whitespace that keeps the piece within the limit; fall
        # back to a hard cut when the window contains no whitespace at all.
        cut = next(
            (i for i in range(limit, 0, -1) if segment[i].isspace()),
            limit,
        )
        pieces.append(segment[:cut].rstrip())
        segment = segment[cut:].lstrip()
    if segment:
        pieces.append(segment)
    return pieces


def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to *overlap* trailing characters of *chunk*, starting at a word boundary."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    start = len(chunk) - overlap
    # If the cut landed inside a word, skip forward past that partial word.
    if not chunk[start - 1].isspace() and not chunk[start].isspace():
        while start < len(chunk) and not chunk[start].isspace():
            start += 1
    return chunk[start:].strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks with optional overlap.

    Sentences are detected where ``.``, ``!`` or ``?`` is followed by
    whitespace, so dotted tokens such as ``os.path.join`` or ``3.14`` stay
    intact and the original punctuation is preserved. Sentences are packed
    greedily into chunks of at most ``chunk_size`` characters. A sentence that
    is longer than ``chunk_size`` on its own is broken at word boundaries.

    Every chunk after the first starts with the trailing ``overlap``
    characters of the previous chunk (trimmed to whole words), so a chunk may
    exceed ``chunk_size`` by up to ``overlap + 1`` characters.

    Args:
        text: Text to split.
        chunk_size: Target maximum number of new characters per chunk.
        overlap: Number of trailing characters carried into the next chunk.
            Negative values are treated as 0; values of ``chunk_size`` or more
            are reduced to ``chunk_size - 1``.

    Returns:
        The chunks in document order; an empty list for empty or blank text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    overlap = max(0, min(overlap, chunk_size - 1))

    segments: List[str] = []
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if sentence:
            segments.extend(_split_oversized(sentence, chunk_size))

    chunks: List[str] = []
    current = ""
    for segment in segments:
        candidate = f"{current} {segment}" if current else segment
        if current and len(candidate) > chunk_size:
            chunks.append(current)
            # Seed the next chunk with the end of this one for context.
            tail = _overlap_tail(current, overlap)
            current = f"{tail} {segment}" if tail else segment
        else:
            current = candidate

    if current:
        chunks.append(current)

    return chunks
def process_documentation(url: str) -> str:
    """Fetch, chunk, embed and index the page at *url*.

    On success the module-level ``doc_chunks``, ``doc_embeddings``, ``index``
    and ``source_url`` are replaced together. On any failure they are left
    untouched, so the chunk list and the FAISS index can never disagree.

    Args:
        url: Address of the documentation page. Surrounding whitespace is
            ignored.

    Returns:
        A human-readable status message: a success summary, a validation
        hint, or ``"Error: ..."`` describing the failure.
    """
    global doc_chunks, doc_embeddings, index, source_url

    url = url.strip() if url else ""
    if not url:
        return "Please provide a URL"

    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)

        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."

        print("Chunking text...")
        new_chunks = chunk_text(text)

        if not new_chunks:
            return "No content chunks created. The documentation might be empty."

        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = model.encode(new_chunks, show_progress_bar=False)

        dimension = new_embeddings.shape[1]
        new_index = faiss.IndexFlatL2(dimension)
        new_index.add(new_embeddings.astype('float32'))
    except Exception as e:
        return f"Error: {str(e)}"

    # Commit only after every step succeeded. Updating the globals one by one
    # could leave new chunks paired with a stale index if a later step failed.
    doc_chunks, doc_embeddings, index, source_url = (
        new_chunks,
        new_embeddings,
        new_index,
        url,
    )

    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(doc_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"
def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks most similar to *question* and format them.

    The question is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index``. The best chunk is shown as the
    answer, the second best (if any) as additional context, and every
    retrieved chunk is listed with a ``1 / (1 + distance)`` score.

    Args:
        question: The user's question.
        top_k: How many chunks to retrieve. Coerced to ``int`` and clamped to
            the range ``1 .. len(doc_chunks)``.

    Returns:
        ``(answer, sources)``. ``sources`` is an empty string when the input
        is rejected or an error occurs.
    """
    if not question or not question.strip():
        return "Please enter a question", ""

    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""

    try:
        # FAISS needs an integer k (the UI may deliver a float). Asking for
        # more neighbours than there are vectors yields -1 labels, which
        # would index doc_chunks from the end and return a bogus chunk.
        k = max(1, min(int(top_k), len(doc_chunks)))

        question_embedding = model.encode([question])
        distances, indices = index.search(question_embedding.astype('float32'), k)

        hits = [
            (doc_chunks[i], dist)
            for i, dist in zip(indices[0], distances[0])
            if 0 <= i < len(doc_chunks)
        ]
        if not hits:
            return "No relevant content found in the processed documentation.", ""

        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"

        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )

        return answer, sources

    except Exception as e:
        return f"Error: {str(e)}", ""
# Gradio front end: one section to ingest a documentation page, one to query it.
with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    gr.Markdown(
        "# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval."
    )

    # --- Ingestion: URL in, status message out ---
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(
                label="Documentation URL",
                placeholder="https://docs.python.org/3/tutorial/index.html",
                lines=1,
            )
            process_btn = gr.Button("Process Documentation", variant="primary")
            status_output = gr.Textbox(label="Status", lines=6, interactive=False)

    gr.Markdown("---")

    # --- Query: question and retrieval depth ---
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this documentation about?",
                lines=3,
            )
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                label="Number of chunks to retrieve",
            )
            ask_btn = gr.Button("Ask Question", variant="primary")

    # --- Results: answer on the left, retrieved chunks on the right ---
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(label="Answer", lines=10, interactive=False)

        with gr.Column():
            sources_output = gr.Textbox(label="Source Chunks", lines=10, interactive=False)

    gr.Markdown("### Example URLs to try:")
    example_urls = [
        ["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
        ["https://docs.python.org/3/tutorial/introduction.html"],
        ["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
        ["https://pytorch.org/docs/stable/torch.html"],
    ]
    gr.Examples(examples=example_urls, inputs=url_input)

    process_btn.click(
        fn=process_documentation,
        inputs=[url_input],
        outputs=[status_output],
    )

    # The Ask button and pressing Enter in the question box run the same query.
    qa_wiring = dict(
        fn=answer_question,
        inputs=[question_input, top_k_slider],
        outputs=[answer_output, sources_output],
    )
    ask_btn.click(**qa_wiring)
    question_input.submit(**qa_wiring)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()