# Document processing pipeline: load the Shakespeare corpus, index it, and answer questions about it.
# Clone https://github.com/TheMITTech/shakespeare
import subprocess
subprocess.run(["git", "clone", "https://github.com/TheMITTech/shakespeare.git"], check=True)
# Collect all play HTML files from the cloned repository
from glob import glob
files = glob("./shakespeare/**/*.html", recursive=True)
# Move the HTML documents into a flat ./data directory
import shutil
import os
os.makedirs('./data', exist_ok=True)
destination_folder = './data/'
for html_file in files:
    shutil.move(html_file, destination_folder + os.path.basename(html_file))
# Read the documents with BeautifulSoup via BSHTMLLoader
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
bshtml_dir_loader = DirectoryLoader("./data", loader_cls=BSHTMLLoader)
data = bshtml_dir_loader.load()
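# Optional sanity check (not in the original script): confirm how many HTML documents were loaded.
print(f"Loaded {len(data)} documents")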
# Prepare the tokenizer used to measure chunk sizes for the documents
from transformers import AutoTokenizer
bloomz_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-1b7")
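# Optional illustration (assumed example string): the splitter below measures chunk_size in these tokens, not characters.
print(f"Token count: {len(bloomz_tokenizer.encode('To be, or not to be, that is the question'))}")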
# Now that we have our tokenizer, let's split our documents into bite-sized pieces on the newline character!
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(bloomz_tokenizer, chunk_size=100, chunk_overlap=0, separator='\n')
documents = text_splitter.split_documents(data)
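# Optional sanity check (not in the original script): see how many chunks were produced and preview one.
print(f"Split into {len(documents)} chunks")
print(documents[0].page_content[:200])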
# Set up an embedding model for the documents
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()
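# Optional check (assumed query string): embed a short query to confirm the embedding dimension.
print(f"Embedding dimension: {len(embeddings.embed_query('Romeo and Juliet'))}")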
# Store the chunks in a Chroma vector database
from langchain.vectorstores import Chroma
persist_directory = "vector_db"
vectordb = Chroma.from_documents(documents, embedding=embeddings, persist_directory=persist_directory)
# Persist the vector store to disk and release the in-memory handle
vectordb.persist()
vectordb = None
# Reload the persisted vector store from disk
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
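# Optional check (assumed query string): run a quick similarity search against the reloaded store.
sample_hits = vectordb.similarity_search("Wherefore art thou Romeo?", k=3)
print([hit.metadata.get("source") for hit in sample_hits])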
# First up, let's load our model!
from langchain import HuggingFacePipeline
llm = HuggingFacePipeline.from_model_id(
    model_id="bigscience/bloomz-1b7",
    task="text-generation",
    model_kwargs={"temperature": 0, "max_length": 500})
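# Optional smoke test (assumed prompt string, not in the original script): generate a short completion directly.
print(llm("William Shakespeare wrote"))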
# Now let's set up our document vector store as a Retriever tool so we can leverage it in our chain!
doc_retriever = vectordb.as_retriever()
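# Optional check (assumed query string): the retriever should return the chunks most relevant to a question.
relevant_docs = doc_retriever.get_relevant_documents("Who killed Mercutio?")
print(f"Retrieved {len(relevant_docs)} chunks")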
# Combine the LLM and the retriever into a RetrievalQA chain
from langchain.chains import RetrievalQA
shakespeare_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=doc_retriever)
def make_answer(query):
    return shakespeare_qa.run(query)
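# Example usage (hypothetical question):
#   make_answer("Which characters appear in the opening scene of Hamlet?")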
if __name__ == "__main__":
    # Make a Gradio interface around the QA chain
    import gradio as gr
    gr.Interface(
        make_answer,
        [gr.Textbox(lines=2, label="Question")],
        gr.Textbox(label="Answer"),
        title="GenerativeQA",
        description="GenerativeQA answers questions about Shakespeare's plays.",
    ).launch()