Spaces:
Build error
| from typing import List, Tuple | |
| from pathlib import Path | |
| import subprocess | |
| import os | |
| from dotenv import load_dotenv | |
| from haystack.preview import Pipeline | |
| from haystack.preview.components.retrievers import MemoryBM25Retriever | |
| from haystack.preview.components.generators.openai.gpt import GPTGenerator | |
| from haystack.preview.components.builders.prompt_builder import PromptBuilder | |
| from haystack.preview.components.preprocessors import ( | |
| DocumentCleaner, | |
| TextDocumentSplitter, | |
| ) | |
| from haystack.preview.components.writers import DocumentWriter | |
| from haystack.preview.components.file_converters import TextFileToDocument | |
| from haystack.preview.document_stores.memory import MemoryDocumentStore | |
| import streamlit as st | |
# Pull in environment variables; the OpenAI API key is read from here later.
load_dotenv()

# Documentation sources to index: (name, git clone URL, glob for doc files).
DOCUMENTATIONS = [
    ("docarray", "https://github.com/docarray/docarray.git", "./docs/**/*.md"),
    ("streamlit", "https://github.com/streamlit/docs.git", "./content/**/*.md"),
    ("jinja", "https://github.com/pallets/jinja.git", "./docs/**/*.rst"),
    ("pandas", "https://github.com/pandas-dev/pandas.git", "./docs/source/**/*.rst"),
    (
        "elasticsearch",
        "https://github.com/elastic/elasticsearch.git",
        "./docs/**/*.asciidoc",
    ),
]
def fetch(documentations: List[Tuple[str, str, str]]):
    """Shallow-clone each documentation repo and collect the matching files.

    Each entry of *documentations* is a ``(name, git_url, glob_pattern)``
    triple. Repos already present under ``downloaded_docs/`` are not
    re-cloned. Returns a list of paths matching each repo's glob pattern.
    """
    base = Path(__file__).parent / "downloaded_docs"
    collected = []
    for name, url, pattern in documentations:
        st.write(f"Fetching {name} repository")
        checkout = base / name
        if not checkout.exists():
            # --depth 1: we only need the current tree, not the history.
            subprocess.run(
                ["git", "clone", "--depth", "1", url, str(checkout)],
                check=True,
            )
        collected += list(checkout.glob(pattern))
    return collected
# Singleton slot for the shared document store. A fresh store per call was a
# bug: index_files() wrote documents into one MemoryDocumentStore while
# search()'s retriever queried a different, empty instance, so retrieval
# always came back empty.
_DOCUMENT_STORE = None


def document_store():
    """Return the shared MemoryDocumentStore, creating it on first use.

    Every caller (indexing and retrieval alike) gets the same instance, so
    documents written during indexing are visible to the retriever.
    """
    global _DOCUMENT_STORE
    if _DOCUMENT_STORE is None:
        _DOCUMENT_STORE = MemoryDocumentStore()
    return _DOCUMENT_STORE
def index_files(files):
    """Convert, clean, split and persist *files* into the document store.

    Builds a four-stage indexing pipeline
    (converter -> cleaner -> splitter -> writer) and runs it once over the
    given file paths, writing the resulting documents into the store.
    """
    pipeline = Pipeline()
    pipeline.add_component("converter", TextFileToDocument(progress_bar=False))
    pipeline.add_component("cleaner", DocumentCleaner())
    pipeline.add_component("splitter", TextDocumentSplitter())
    pipeline.add_component(
        "writer",
        DocumentWriter(document_store=document_store(), policy="overwrite"),
    )
    # Chain the stages in order: each component feeds the next one.
    for upstream, downstream in (
        ("converter", "cleaner"),
        ("cleaner", "splitter"),
        ("splitter", "writer"),
    ):
        pipeline.connect(upstream, downstream)
    # Kick off the run: clone results flow in via the converter's inputs.
    pipeline.run({"converter": {"paths": files}})
def search(question: str) -> str:
    """Answer *question* with GPT, grounded on BM25-retrieved documentation.

    Retrieves the top-10 matching documents from the store, renders them
    into a prompt, sends the prompt to the generator, and returns the first
    generated reply.
    """
    template = """Take a deep breath and think then answer given the context
Context: {{ documents|map(attribute='text')|join('\n') }}
Question: {{ question }}
Answer:
"""
    query_pipeline = Pipeline()
    query_pipeline.add_component(
        "docs_retriever",
        MemoryBM25Retriever(document_store=document_store(), top_k=10),
    )
    query_pipeline.add_component("builder", PromptBuilder(template))
    query_pipeline.add_component(
        "gpt35", GPTGenerator(api_key=os.getenv("OPENAI_API_KEY", ""))
    )
    query_pipeline.connect("docs_retriever.documents", "builder.documents")
    query_pipeline.connect("builder.prompt", "gpt35.prompt")
    result = query_pipeline.run(
        {
            "docs_retriever": {"query": question},
            "builder": {"question": question},
        }
    )
    return result["gpt35"]["replies"][0]
# One-time setup on page load: download the doc repos and index them, while
# reporting progress in a status widget that collapses after the first run.
with st.status(
    "Downloading documentation files...",
    expanded=st.session_state.get("expanded", True),
) as status:
    doc_files = fetch(DOCUMENTATIONS)
    status.update(label="Indexing documentation...")
    index_files(doc_files)
    status.update(
        label="Download and indexing complete!", state="complete", expanded=False
    )
    # Collapse the widget on subsequent Streamlit reruns.
    st.session_state["expanded"] = False
st.header("Documentation finder", divider="rainbow")

# The walrus binds the submitted text; an empty input skips the search.
if question := st.text_input(
    label="What do you need to know?", placeholder="What is a DataFrame?"
):
    with st.spinner("Waiting"):
        answer = search(question)
    st.balloons()
    st.markdown(answer)