# Rahaf2001's picture
# Update app.py
# cf2feea verified
import gradio as gr
import requests
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Tuple
import re
# Sentence-embedding model, instantiated once at import time and reused by
# process_documentation() and answer_question().
model = SentenceTransformer('all-MiniLM-L6-v2')
# Text chunks of the most recently processed document (filled by
# process_documentation()).
doc_chunks = []
# Embeddings computed for doc_chunks; None until a document has been processed.
doc_embeddings = None
# FAISS index built over doc_embeddings; None until a document has been processed.
index = None
# URL of the document the current chunks/index were built from.
source_url = ""
def fetch_documentation(url: str) -> str:
    """Download *url* and return its visible text, one text fragment per line.

    The page is fetched with browser-like headers, ``script``, ``style``,
    ``nav``, ``footer`` and ``header`` elements are removed, and the remaining
    text is whitespace-normalised.

    Args:
        url: Address of the documentation page to fetch.

    Returns:
        The cleaned page text with blank fragments removed.

    Raises:
        Exception: With a user-facing message when the request is forbidden
            (403), not found (404), times out, or fails for any other reason.
            The original error is attached as ``__cause__``.
    """
    # Accept-Encoding is intentionally not set here: requests advertises only
    # the encodings it can actually decode. Forcing "br" without a Brotli
    # library installed yields undecoded bytes that parse as garbage.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    try:
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Drop non-content elements so they do not pollute the extracted text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        stripped_lines = (line.strip() for line in soup.get_text().splitlines())
        # Break on runs of 2+ whitespace characters (layout gaps), not on every
        # single space, so that each word does not end up on its own line.
        fragments = (
            fragment.strip()
            for line in stripped_lines
            for fragment in re.split(r'\s{2,}', line)
        )
        return '\n'.join(fragment for fragment in fragments if fragment)
    except requests.exceptions.HTTPError as e:
        # Classify by the real status code rather than by searching the error
        # text, which also contains the URL and may include "403"/"404" itself.
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 403:
            raise Exception("Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)") from e
        if status_code == 404:
            raise Exception("Page not found (404). Please check the URL is correct.") from e
        raise Exception(f"Error fetching URL: {e}") from e
    except requests.exceptions.Timeout as e:
        raise Exception("Request timeout. The website took too long to respond.") from e
    except Exception as e:
        raise Exception(f"Error fetching URL: {e}") from e
def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to *overlap* trailing characters of *chunk*, starting on a word boundary."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    starts_on_boundary = tail[0].isspace() or chunk[-overlap - 1].isspace()
    if not starts_on_boundary:
        # The cut landed mid-word: drop the partial leading word.
        parts = tail.split(None, 1)
        tail = parts[1] if len(parts) > 1 else ""
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    Sentences are packed greedily into a chunk until adding the next one would
    exceed ``chunk_size``. Each chunk after the first is prefixed with the
    trailing ``overlap`` characters (word-aligned) of the previous chunk so
    that context spanning a chunk boundary is not lost.

    Args:
        text: Text to split.
        chunk_size: Soft upper bound on the characters packed into a chunk.
            A single sentence longer than this is kept whole, and the overlap
            prefix is added on top of the packed sentences.
        overlap: Number of trailing characters of the previous chunk to repeat
            at the start of the next one. ``0`` disables overlap.

    Returns:
        The list of chunks; empty when *text* contains no non-blank content.
    """
    # Keep the overlap strictly smaller than a chunk (and never negative).
    overlap = max(0, min(overlap, chunk_size - 1))

    # Split only where a terminator is followed by whitespace. This keeps the
    # original punctuation and avoids cutting inside "3.12" or "example.com".
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        candidate = f"{current} {sentence}" if current else sentence
        if not current or len(candidate) <= chunk_size:
            current = candidate
            continue
        chunks.append(current)
        tail = _overlap_tail(current, overlap)
        current = f"{tail} {sentence}" if tail else sentence
    if current:
        chunks.append(current)
    return chunks
def process_documentation(url: str) -> str:
    """Fetch, chunk, embed and index the document at *url*.

    On success the module-level ``doc_chunks``, ``doc_embeddings``, ``index``
    and ``source_url`` are replaced together. On any failure they are left
    untouched, so a previously processed document stays queryable and the
    chunks can never get out of sync with the index.

    Args:
        url: Address of the documentation page to process.

    Returns:
        A human-readable status message: a success summary, a validation
        message, or ``"Error: ..."`` describing what went wrong.
    """
    global doc_chunks, doc_embeddings, index, source_url

    url = (url or "").strip()
    if not url:
        return "Please provide a URL"

    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)
        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."

        print("Chunking text...")
        new_chunks = chunk_text(text)
        if not new_chunks:
            return "No content chunks created. The documentation might be empty."

        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = model.encode(new_chunks, show_progress_bar=False).astype('float32')
        new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
        new_index.add(new_embeddings)
    except Exception as e:
        return f"Error: {str(e)}"

    # Publish all state in one step, only after every stage has succeeded.
    doc_chunks, doc_embeddings, index, source_url = new_chunks, new_embeddings, new_index, url
    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(doc_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"
def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks most similar to *question* and format an answer.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve. Coerced to ``int`` (UI sliders may
            deliver floats) and clamped to the number of indexed chunks.

    Returns:
        A ``(answer, sources)`` pair of display strings. ``answer`` shows the
        best chunk (plus the runner-up when available). ``sources`` lists
        every retrieved chunk with a similarity score derived from its L2
        distance. On validation failure or error, ``sources`` is ``""``.
    """
    if not question or not question.strip():
        return "Please enter a question", ""
    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""

    try:
        # Never ask for more neighbours than there are chunks: FAISS pads the
        # result with index -1, which would silently alias doc_chunks[-1].
        k = max(1, min(int(top_k), len(doc_chunks)))
        query = model.encode([question]).astype('float32')
        distances, indices = index.search(query, k)

        hits = [
            (doc_chunks[chunk_id], float(dist))
            for chunk_id, dist in zip(indices[0], distances[0])
            if chunk_id >= 0  # defensive: skip FAISS "no result" padding
        ]
        if not hits:
            return "No relevant chunks found.", ""

        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"
        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )
        return answer, sources
    except Exception as e:
        return f"Error: {str(e)}", ""
# Gradio UI definition. Components are laid out in the order they are created,
# so statement order here is significant.
# NOTE(review): the nesting below was reconstructed from a view of the file
# that had lost its indentation — confirm it matches the intended layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")
    # Step 1: enter a URL and build the index; the status box shows the result
    # string returned by process_documentation().
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(
                label="Documentation URL",
                placeholder="https://docs.python.org/3/tutorial/index.html",
                lines=1
            )
            process_btn = gr.Button("Process Documentation", variant="primary")
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False
            )
    gr.Markdown("---")
    # Step 2: ask a question and choose how many chunks to retrieve.
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this documentation about?",
                lines=3
            )
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                label="Number of chunks to retrieve"
            )
            ask_btn = gr.Button("Ask Question", variant="primary")
    # Results: the two strings returned by answer_question(), side by side.
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(
                label="Answer",
                lines=10,
                interactive=False
            )
        with gr.Column():
            sources_output = gr.Textbox(
                label="Source Chunks",
                lines=10,
                interactive=False
            )
    gr.Markdown("### Example URLs to try:")
    # Clicking an example fills url_input.
    gr.Examples(
        examples=[
            ["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
            ["https://docs.python.org/3/tutorial/introduction.html"],
            ["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
            ["https://pytorch.org/docs/stable/torch.html"],
        ],
        inputs=url_input
    )
    # Event wiring: the process button builds the index from the URL.
    process_btn.click(
        fn=process_documentation,
        inputs=[url_input],
        outputs=[status_output]
    )
    # Asking works both via the button and by submitting the question textbox;
    # both routes call answer_question() with the same inputs and outputs.
    ask_btn.click(
        fn=answer_question,
        inputs=[question_input, top_k_slider],
        outputs=[answer_output, sources_output]
    )
    question_input.submit(
        fn=answer_question,
        inputs=[question_input, top_k_slider],
        outputs=[answer_output, sources_output]
    )
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()