Spaces:
Sleeping
Sleeping
| # Utilities to build a RAG system to query information from the | |
| # gwIAS search pipeline using Langchain | |
| # Thanks to Pablo Villanueva Domingo for sharing his CAMELS template | |
| # https://huggingface.co/spaces/PabloVD/CAMELSDocBot | |
from collections.abc import Iterator
import base64
import json
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from langchain import hub
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
def github_to_raw(url):
    """Convert a GitHub "blob" URL into its raw-content equivalent."""
    # Swap the host, then drop the "/blob/" path segment; all other
    # URL components are left untouched.
    raw = url.replace("github.com", "raw.githubusercontent.com")
    return raw.replace("/blob/", "/")
def load_github_notebook(url):
    """Load a Jupyter notebook from a GitHub blob URL via the GitHub contents API.

    Returns one Document per non-empty cell, tagged with cell type and number
    metadata. Best-effort: any failure (bad URL, network error, invalid JSON)
    is reported to stdout and an empty list is returned.
    """
    try:
        # Convert the blob URL to the API URL. Shape is:
        # https://github.com/<owner>/<repo>/blob/<branch>/<path...>
        if "github.com" in url and "/blob/" in url:
            parts = url.replace("https://github.com/", "").split("/")
            owner = parts[0]
            repo = parts[1]
            branch = parts[3]  # parts[2] is the literal "blob"
            path = "/".join(parts[4:])
            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
        else:
            raise ValueError("URL must be a GitHub blob URL")

        # Fetch the notebook; the timeout keeps a stalled request from
        # hanging the whole documentation load.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        content_data = response.json()

        # The contents API normally base64-encodes file bodies.
        if content_data.get('encoding') == 'base64':
            notebook_content = base64.b64decode(content_data['content']).decode('utf-8')
        else:
            notebook_content = content_data['content']

        # Parse the notebook JSON (nbformat structure).
        notebook = json.loads(notebook_content)

        docs = []
        cell_count = 0
        for cell in notebook.get('cells', []):
            cell_count += 1
            cell_type = cell.get('cell_type', 'unknown')
            source = cell.get('source', [])
            # nbformat stores cell source as a list of lines; tolerate
            # plain strings as well.
            if isinstance(source, list):
                content = ''.join(source)
            else:
                content = str(source)
            if not content.strip():
                continue  # skip empty cells
            metadata = {
                'source': url,
                'cell_type': cell_type,
                'cell_number': cell_count,
                'name': f"{url} - Cell {cell_count} ({cell_type})"
            }
            # Prefix with the cell type so the retriever can distinguish
            # code from markdown context.
            formatted_content = f"[{cell_type.upper()} CELL {cell_count}]\n{content}"
            docs.append(Document(page_content=formatted_content, metadata=metadata))
        return docs
    except Exception as e:
        # Best-effort loader: report and fall back to an empty result.
        print(f"Error loading notebook from {url}: {str(e)}")
        return []
def clean_text(text):
    """Normalise whitespace in scraped page text."""
    # First collapse runs of 3+ newlines to a paragraph break, then squeeze
    # every remaining whitespace run (including that break) to one space.
    collapsed = re.sub(r'\n{3,}', '\n\n', text)
    collapsed = re.sub(r'\s{2,}', ' ', collapsed)
    return collapsed.strip()
def clean_github_content(html_content):
    """Extract the meaningful text content from a GitHub HTML page.

    Accepts either raw HTML or an existing BeautifulSoup object and tries,
    from most to least specific: rendered README, code listing, directory
    grid, <main>, <body>, then the whole document.
    """
    # Work on a BeautifulSoup object regardless of the input form.
    if isinstance(html_content, str):
        soup = BeautifulSoup(html_content, 'html.parser')
    else:
        soup = html_content

    # Drop chrome that never carries document content.
    for boilerplate in soup.find_all(['nav', 'footer', 'header']):
        boilerplate.decompose()

    # Candidate containers, most specific first.
    candidates = (
        soup.find('article', class_='markdown-body'),  # rendered README / markdown
        soup.find('table', class_='highlight'),        # code file view
        soup.find('div', role='grid'),                 # directory listing
        soup.find('main'),                             # generic main content
        soup.find('body'),
    )
    for node in candidates:
        if node:
            return clean_text(node.get_text())

    # Nothing matched: fall back to the full document text.
    return clean_text(soup.get_text())
class GitHubLoader(WebBaseLoader):
    """Custom loader for GitHub pages with better content cleaning.

    Handles three page shapes: directory listings ("/tree/" URLs, read via
    the GitHub API), rendered markdown/README pages, and code-file views.
    """

    def clean_text(self, text):
        """Normalise whitespace and strip common GitHub chrome from text."""
        # Collapse newline and whitespace runs.
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r'\s{2,}', ' ', text)
        # Remove navigation/footer strings GitHub injects into every page.
        text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
        return text.strip()

    def lazy_load(self) -> Iterator[Document]:
        """Yield one cleaned Document per URL in ``self.web_paths``.

        Fixes: this method is a generator, so the return annotation is
        ``Iterator[Document]`` (it was mis-annotated ``list[Document]``),
        and requests carry a timeout so a stalled connection cannot hang
        the loader indefinitely.
        """
        for url in self.web_paths:
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()

                # Directory listings are read through the API: the JSON is
                # far more reliable than scraping the HTML file grid.
                if '/tree/' in url:
                    # URL shape: https://github.com/<owner>/<repo>/tree/<branch>/<path...>
                    parts = url.replace("https://github.com/", "").split("/")
                    owner = parts[0]
                    repo = parts[1]
                    branch = parts[3]  # parts[2] is the literal "tree"
                    path = "/".join(parts[4:]) if len(parts) > 4 else ""
                    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
                    api_response = requests.get(api_url, timeout=30)
                    api_response.raise_for_status()
                    contents = api_response.json()
                    if isinstance(contents, list):
                        content = "Directory contents:\n" + "\n".join(
                            [f"{item['name']} ({item['type']})" for item in contents]
                        )
                        yield Document(
                            page_content=self.clean_text(content),
                            metadata={'source': url, 'type': 'github_directory'}
                        )
                        continue
                    # Non-list API response: fall through to HTML parsing.

                # Regular files: parse the HTML, most specific container first.
                soup = BeautifulSoup(response.text, 'html.parser')

                # Rendered README / markdown files.
                readme_content = soup.find('article', class_='markdown-body')
                if readme_content:
                    yield Document(
                        page_content=self.clean_text(readme_content.get_text()),
                        metadata={'source': url, 'type': 'github_markdown'}
                    )
                    continue

                # Code-file view.
                code_content = soup.find('table', class_='highlight')
                if code_content:
                    yield Document(
                        page_content=self.clean_text(code_content.get_text()),
                        metadata={'source': url, 'type': 'github_code'}
                    )
                    continue

                # Any other page with a <main> element.
                main_content = soup.find('main')
                if main_content:
                    yield Document(
                        page_content=self.clean_text(main_content.get_text()),
                        metadata={'source': url, 'type': 'github_other'}
                    )
                    continue

                # Fallback: whole-page text.
                yield Document(
                    page_content=self.clean_text(soup.get_text()),
                    metadata={'source': url, 'type': 'github_fallback'}
                )
            except Exception as e:
                # Best-effort: report the failure and move on to the next URL.
                print(f"Error processing {url}: {str(e)}")
                continue

    def load(self) -> list[Document]:
        """Eagerly materialise ``lazy_load`` into a list."""
        return list(self.lazy_load())
class ReadTheDocsLoader(WebBaseLoader):
    """Custom loader for ReadTheDocs pages.

    Crawls every page under ``base_url`` and extracts the main content
    of each one.
    """

    def __init__(self, base_url: str):
        """Initialize with base URL of the documentation."""
        super().__init__([])
        self.base_url = base_url.rstrip('/')

    def clean_text(self, text: str) -> str:
        """Clean text content from a ReadTheDocs page."""
        # Normalise whitespace runs, then strip common theme boilerplate.
        text = re.sub(r'\s{2,}', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'View page source|Next|Previous|©.*?\.', '', text)
        return text.strip()

    def normalize_url(self, base_url: str, href: str) -> str:
        """Resolve *href* against *base_url*; absolute URLs pass through."""
        if href.startswith(('http://', 'https://')):
            return href
        return urljoin(base_url, href)

    def get_all_pages(self) -> list[str]:
        """Crawl all documentation pages reachable from the base URL.

        Fixes: URL fragments (``page.html#section``) are stripped before
        queueing so the same page is no longer fetched once per anchor
        link, and requests carry a timeout so a stalled server cannot
        hang the crawl.
        """
        visited = set()
        to_visit = {self.base_url}
        docs_urls = set()
        while to_visit:
            url = to_visit.pop()
            if url in visited:
                continue
            visited.add(url)
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # Record the page itself if it belongs to the documentation.
                if url.startswith(self.base_url):
                    docs_urls.add(url)
                # Queue every in-domain link found on the page.
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if not href:
                        continue
                    # Skip pure anchors and external absolute links.
                    if href.startswith('#') or (href.startswith(('http://', 'https://')) and not href.startswith(self.base_url)):
                        continue
                    # Resolve relative links and drop any #fragment so each
                    # page is queued at most once.
                    full_url = self.normalize_url(url, href).split('#', 1)[0]
                    if full_url.startswith(self.base_url):
                        to_visit.add(full_url)
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")
        return list(docs_urls)

    def load(self) -> list[Document]:
        """Fetch every crawled page and return its cleaned main content."""
        urls = self.get_all_pages()
        docs = []
        for url in urls:
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # Prefer the themed main-content container, then <main>.
                main_content = soup.find('div', {'role': 'main'})
                if not main_content:
                    main_content = soup.find('main')
                if not main_content:
                    continue  # no recognisable content area on this page
                content = self.clean_text(main_content.get_text())
                if content:
                    docs.append(Document(
                        page_content=content,
                        metadata={'source': url, 'type': 'readthedocs'}
                    ))
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
        return docs
def load_docs():
    """Load all documentation listed in ``urls.txt`` plus the ReadTheDocs site.

    Each URL is dispatched to the appropriate loader: Jupyter notebooks go
    through the GitHub API, raw.githubusercontent.com files are fetched
    verbatim, and other GitHub URLs are scraped with ``GitHubLoader``.
    Non-GitHub URLs in the file are ignored.
    """
    # Read the URL list, skipping blank lines.
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f if line.strip()]

    docs = []
    for url in urls:
        if "github.com" not in url and "raw.githubusercontent.com" not in url:
            continue  # only GitHub-hosted sources are handled here
        if "/blob/" in url and url.endswith(".ipynb"):
            # Jupyter notebooks: load cell-by-cell via the GitHub API.
            docs.extend(load_github_notebook(url))
        elif "raw.githubusercontent.com" in url:
            # Raw files: fetch the content verbatim (timeout prevents a
            # stalled request from hanging the whole load).
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                docs.append(Document(
                    page_content=response.text,
                    metadata={'source': url, 'type': 'github_raw'}
                ))
            except Exception as e:
                print(f"Error loading raw content from {url}: {str(e)}")
        else:
            # Everything else: scrape the rendered GitHub page.
            loader = GitHubLoader([url])
            docs.extend(loader.load())

    # Append the gwfast ReadTheDocs documentation.
    rtd_loader = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
    docs.extend(rtd_loader.load())
    return docs
def extract_reference(url):
    """Derive a short reference label (repo-relative path) from a GitHub URL."""
    # (substring to detect, separator to split on, use-"root"-fallback).
    # The first matching needle wins, mirroring the original if/elif chain;
    # tree URLs that end at the branch fall back to "root".
    markers = (
        ("blob/main", "blob/main/", False),
        ("tree/main", "tree/main/", True),
        ("blob/master", "blob/master/", False),
        ("tree/master", "tree/master/", True),
        ("refs/heads/master", "refs/heads/master/", False),
    )
    for needle, separator, root_fallback in markers:
        if needle in url:
            tail = url.split(separator)[-1]
            return (tail or "root") if root_fallback else tail
    # Unknown URL shape: use the URL itself as the reference.
    return url


# Join content pages for processing
def format_docs(docs):
    """Concatenate documents, appending a bracketed source reference to each."""
    rendered = []
    for doc in docs:
        src = doc.metadata.get('source', 'Unknown source')
        rendered.append(f"{doc.page_content}\n\nReference: [{extract_reference(src)}]")
    return "\n\n---\n\n".join(rendered)
# Create a RAG chain
def RAG(llm, docs, embeddings):
    """Build a retrieval-augmented generation chain over *docs*.

    Args:
        llm: Language model (LangChain runnable) used for answer generation.
        docs: Documents to index in the vector store.
        embeddings: Embedding function used by the Chroma store.

    Returns:
        A runnable chain mapping a question string to an answer string.
    """
    # Split documents into overlapping chunks so retrieval stays fine-grained.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Index the chunks in an in-memory Chroma vector store.
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    # Start from the community RAG prompt and splice custom instructions in
    # front of its "Question: {question}" suffix.
    prompt = hub.pull("rlm/rag-prompt")
    template = prompt.messages[0].prompt.template
    template_parts = template.split("\nQuestion: {question}")
    combined_template = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Try to keep the answer concise if possible. "
        # Fixed typo in the instruction text: "retrived" -> "retrieved".
        "Write the names of the relevant functions from the retrieved code "
        "and include code snippets to aid the user's understanding. "
        "Include the references used in square brackets at the end of your answer."
        + template_parts[1]
    )
    prompt.messages[0].prompt.template = combined_template

    # context -> retrieved + formatted docs; the question passes through as-is.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain