Spaces:

Rahaf2001
/

software-Documentation-RAG-System

Sleeping

App Files Files Community

software-Documentation-RAG-System / app.py

Rahaf2001

Update app.py

cf2feea verified 7 months ago

raw

history blame contribute delete

7.87 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	import numpy as np
	from sentence_transformers import SentenceTransformer
	import faiss
	from typing import List, Tuple
	import re

	# Sentence-embedding model, instantiated once at import time and reused by
	# process_documentation() (chunk embeddings) and answer_question() (query embedding).
	model = SentenceTransformer('all-MiniLM-L6-v2')

	# Module-level state describing the currently loaded document. It is written by
	# process_documentation() and read by answer_question().
	# Text chunks of the processed page; positions line up with the rows added to `index`.
	doc_chunks = []
	# Embedding matrix returned by model.encode() for `doc_chunks`; None until a page is processed.
	doc_embeddings = None
	# faiss.IndexFlatL2 built over the chunk embeddings; None until a page is processed.
	index = None
	# URL of the page the current chunks were extracted from.
	source_url = ""

	def fetch_documentation(url: str) -> str:
	    """Download a page and return its visible text, one phrase per line.

	    The page is requested with browser-like headers, parsed with
	    BeautifulSoup, stripped of ``script``/``style``/``nav``/``footer``/
	    ``header`` elements, and reduced to non-empty text fragments joined
	    with newlines.

	    Args:
	        url: Address of the documentation page to download.

	    Returns:
	        The cleaned text content of the page.

	    Raises:
	        RuntimeError: If the request or the parsing fails. The message
	            distinguishes HTTP 403, HTTP 404 and timeouts from other errors.
	            ``RuntimeError`` is an ``Exception`` subclass, so existing
	            ``except Exception`` callers keep working.
	    """
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	        # The wildcard entry must be "*/*"; a bare "/" is not a valid media range.
	        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	        'Accept-Language': 'en-US,en;q=0.5',
	        # Accept-Encoding is intentionally not set here: requests advertises
	        # only the encodings it can actually decode. Forcing "br" makes servers
	        # send Brotli, which is unreadable unless an optional package is installed.
	        'DNT': '1',
	        'Connection': 'keep-alive',
	        'Upgrade-Insecure-Requests': '1',
	    }

	    try:
	        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Drop elements that carry no documentation content.
	        for element in soup(["script", "style", "nav", "footer", "header"]):
	            element.decompose()

	        text = soup.get_text()

	        lines = (line.strip() for line in text.splitlines())
	        # Split on runs of two spaces so that several headlines sharing a line
	        # are separated, while ordinary sentences stay on one line. Splitting
	        # on a single space would put every word on its own line.
	        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
	        return '\n'.join(phrase for phrase in phrases if phrase)
	    except requests.exceptions.HTTPError as e:
	        # Classify by the real status code rather than by searching the
	        # message text, which also contains the URL.
	        status_code = getattr(e.response, "status_code", None)
	        if status_code == 403:
	            raise RuntimeError("Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)") from e
	        if status_code == 404:
	            raise RuntimeError("Page not found (404). Please check the URL is correct.") from e
	        raise RuntimeError(f"Error fetching URL: {e}") from e
	    except requests.exceptions.Timeout as e:
	        raise RuntimeError("Request timeout. The website took too long to respond.") from e
	    except Exception as e:
	        raise RuntimeError(f"Error fetching URL: {e}") from e

	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
	    """Split text into sentence-aligned chunks of at most ``chunk_size`` characters.

	    Sentences are detected at ``.``, ``!`` or ``?`` followed by whitespace,
	    so tokens such as ``3.10`` or URLs are not broken apart and the original
	    punctuation is kept. Consecutive sentences are packed into a chunk
	    (joined by single spaces) while they fit. A sentence longer than
	    ``chunk_size`` is broken at word boundaries.

	    Each chunk after the first starts with the trailing ``overlap``
	    characters of the previous chunk (trimmed to whole words), provided
	    that the prefix still fits within ``chunk_size``.

	    Args:
	        text: Text to split.
	        chunk_size: Maximum chunk length in characters. Only a single word
	            longer than this can produce a longer chunk.
	        overlap: Number of trailing characters of the previous chunk to
	            repeat at the start of the next one. Values are clamped to the
	            range ``0 .. chunk_size // 2``.

	    Returns:
	        The list of chunks, in document order. Empty when ``text`` contains
	        no non-whitespace content.
	    """
	    # Clamp so the repeated prefix can never dominate a chunk.
	    overlap = max(0, min(overlap, chunk_size // 2))

	    # Build the packing units: whole sentences, or word-aligned pieces of
	    # sentences that are too long to fit in one chunk.
	    units = []
	    for sentence in re.split(r'(?<=[.!?])\s+', text):
	        sentence = sentence.strip()
	        if not sentence:
	            continue
	        if len(sentence) <= chunk_size:
	            units.append(sentence)
	        else:
	            units.extend(_split_long_sentence(sentence, chunk_size))

	    chunks = []
	    current = ""
	    for unit in units:
	        if not current:
	            current = unit
	        elif len(current) + 1 + len(unit) <= chunk_size:
	            current = f"{current} {unit}"
	        else:
	            chunks.append(current)
	            tail = _overlap_tail(current, overlap)
	            # Carry the overlap only when it does not push the chunk over the limit.
	            if tail and len(tail) + 1 + len(unit) <= chunk_size:
	                current = f"{tail} {unit}"
	            else:
	                current = unit

	    if current:
	        chunks.append(current)

	    return chunks


	def _split_long_sentence(sentence: str, limit: int) -> List[str]:
	    """Break a sentence into word-aligned pieces no longer than ``limit`` where possible."""
	    pieces = []
	    current = ""
	    for word in sentence.split():
	        candidate = f"{current} {word}" if current else word
	        if current and len(candidate) > limit:
	            pieces.append(current)
	            current = word
	        else:
	            current = candidate
	    if current:
	        pieces.append(current)
	    return pieces


	def _overlap_tail(chunk: str, overlap: int) -> str:
	    """Return the last ``overlap`` characters of ``chunk``, trimmed to whole words."""
	    if overlap <= 0:
	        return ""
	    if len(chunk) <= overlap:
	        return chunk
	    cut = len(chunk) - overlap
	    tail = chunk[cut:]
	    # If the cut landed inside a word, drop that partial word.
	    if not chunk[cut - 1].isspace() and not tail[0].isspace():
	        parts = tail.split(None, 1)
	        tail = parts[1] if len(parts) == 2 else ""
	    return tail.strip()

	def process_documentation(url: str) -> str:
	    """Fetch a page, chunk it, embed the chunks and build the search index.

	    On success the module-level ``doc_chunks``, ``doc_embeddings``, ``index``
	    and ``source_url`` are replaced together. On any failure they are left
	    untouched, so a previously processed document stays fully usable.

	    Args:
	        url: Address of the documentation page. Surrounding whitespace is
	            ignored.

	    Returns:
	        A human-readable status message: a success summary, a validation
	        hint, or ``"Error: ..."`` when an exception occurred.
	    """
	    global doc_chunks, doc_embeddings, index, source_url

	    url = url.strip() if url else ""
	    if not url:
	        return "Please provide a URL"

	    try:
	        print("Fetching documentation...")
	        text = fetch_documentation(url)

	        if len(text) < 100:
	            return "Retrieved content is too short. Please check the URL."

	        print("Chunking text...")
	        new_chunks = chunk_text(text)

	        if not new_chunks:
	            return "No content chunks created. The documentation might be empty."

	        print(f"Creating embeddings for {len(new_chunks)} chunks...")
	        new_embeddings = model.encode(new_chunks, show_progress_bar=False)

	        dimension = new_embeddings.shape[1]
	        new_index = faiss.IndexFlatL2(dimension)
	        new_index.add(new_embeddings.astype('float32'))

	        # Publish everything at once, only after every step succeeded, so the
	        # chunk list and the index can never refer to different documents.
	        doc_chunks = new_chunks
	        doc_embeddings = new_embeddings
	        index = new_index
	        source_url = url

	        return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(new_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"

	    except Exception as e:
	        # UI boundary: report the failure as text instead of raising.
	        return f"Error: {str(e)}"

	def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
	    """Retrieve the chunks closest to a question and format them for display.

	    The question is embedded with the shared model and searched against the
	    FAISS index built by ``process_documentation``. No text is generated;
	    the "answer" is the best-matching chunk plus the runner-up as context.

	    Args:
	        question: The user's question.
	        top_k: Number of chunks to retrieve. It is converted to ``int`` and
	            limited to the range ``1 .. len(doc_chunks)``.

	    Returns:
	        A pair ``(answer, sources)``. ``answer`` holds the formatted best
	        matches, and ``sources`` lists every retrieved chunk with a
	        similarity score of ``1 / (1 + L2 distance)``. On validation failure
	        or error, ``answer`` holds the message and ``sources`` is empty.
	    """
	    if not question or not question.strip():
	        return "Please enter a question", ""

	    if index is None or not doc_chunks:
	        return "Please process documentation first by entering a URL above", ""

	    try:
	        # UI sliders may deliver a float, and FAISS needs an integer k.
	        # Asking for more neighbours than there are vectors makes FAISS pad
	        # the result with -1 labels.
	        k = max(1, min(int(top_k), len(doc_chunks)))

	        question_embedding = model.encode([question])
	        distances, indices = index.search(question_embedding.astype('float32'), k)

	        # Skip padding labels (-1); indexing with them would silently pick
	        # the last chunk.
	        hits = [
	            (doc_chunks[i], dist)
	            for i, dist in zip(indices[0], distances[0])
	            if 0 <= i < len(doc_chunks)
	        ]
	        if not hits:
	            return "No relevant content found in the processed documentation.", ""

	        answer = f"Based on the documentation at {source_url}:\n\n"
	        answer += f"Relevant Information:\n\n{hits[0][0]}"

	        if len(hits) > 1:
	            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

	        sources = "Retrieved Chunks:\n\n" + "".join(
	            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
	            for rank, (chunk, dist) in enumerate(hits, start=1)
	        )

	        return answer, sources

	    except Exception as e:
	        # UI boundary: report the failure as text instead of raising.
	        return f"Error: {str(e)}", ""

	# Gradio user interface. Components are created in the order they should
	# appear on the page; the event wiring at the end connects them to
	# process_documentation() and answer_question().
	# NOTE(review): the nesting indentation of the with-blocks is not present in
	# this copy of the file — confirm the intended Row/Column layout when restoring it.
	with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
	gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")

	# Step 1: URL entry, the button that triggers processing, and a read-only status box.
	with gr.Row():
	with gr.Column():
	url_input = gr.Textbox(
	label="Documentation URL",
	placeholder="https://docs.python.org/3/tutorial/index.html",
	lines=1
	)
	process_btn = gr.Button("Process Documentation", variant="primary")
	status_output = gr.Textbox(
	label="Status",
	lines=6,
	interactive=False
	)

	gr.Markdown("---")

	# Step 2: question entry, the number of chunks to retrieve, and the ask button.
	with gr.Row():
	with gr.Column():
	question_input = gr.Textbox(
	label="Your Question",
	placeholder="What is this documentation about?",
	lines=3
	)

	# Passed to answer_question() as top_k (1 to 5, default 3).
	top_k_slider = gr.Slider(
	minimum=1,
	maximum=5,
	value=3,
	step=1,
	label="Number of chunks to retrieve"
	)

	ask_btn = gr.Button("Ask Question", variant="primary")

	# Results: the two read-only boxes receive the (answer, sources) pair
	# returned by answer_question().
	with gr.Row():
	with gr.Column():
	answer_output = gr.Textbox(
	label="Answer",
	lines=10,
	interactive=False
	)

	with gr.Column():
	sources_output = gr.Textbox(
	label="Source Chunks",
	lines=10,
	interactive=False
	)

	# Example URLs offered as inputs for the URL box.
	gr.Markdown("### Example URLs to try:")
	gr.Examples(
	examples=[
	["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
	["https://docs.python.org/3/tutorial/introduction.html"],
	["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
	["https://pytorch.org/docs/stable/torch.html"],
	],
	inputs=url_input
	)

	# Event wiring: processing writes its status message into the status box.
	process_btn.click(
	fn=process_documentation,
	inputs=[url_input],
	outputs=[status_output]
	)

	# Clicking the ask button and submitting the question box run the same handler.
	ask_btn.click(
	fn=answer_question,
	inputs=[question_input, top_k_slider],
	outputs=[answer_output, sources_output]
	)

	question_input.submit(
	fn=answer_question,
	inputs=[question_input, top_k_slider],
	outputs=[answer_output, sources_output]
	)

	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	demo.launch()