import os

import gradio as gr
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import HuggingFaceHub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter

# Read the Hugging Face API token from the 'hf_api' environment variable (e.g. a Space secret)
# and expose it under the name the HuggingFaceHub wrapper expects.
HUGGINGFACEHUB_API_TOKEN = os.getenv("hf_api")
if HUGGINGFACEHUB_API_TOKEN is None:
    raise RuntimeError("Set the 'hf_api' environment variable to a Hugging Face API token.")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
def process_input(urls, question):
    # model_local = ChatOllama(model="mistral")  # local alternative via Ollama
    # Use the hosted Mistral-7B-Instruct model through the Hugging Face Inference API.
    repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
    llm = HuggingFaceHub(
        repo_id=repo_id,
    )
    model_local = llm

    # Convert the newline-separated string of URLs into a list; fall back to defaults if empty.
    if urls:
        urls_list = urls.split("\n")
    else:
        urls_list = [
            "https://ollama.com/",
            "https://ollama.com/blog/windows-preview",
            "https://ollama.com/blog/openai-compatibility",
        ]

    # Append the pre-extracted links shipped with the app.
    link_list_path = "./shef_extracted_links.txt"
    with open(link_list_path, "r") as f:
        link_list = [l.strip() for l in f.readlines()]
    urls_list += link_list
    # Load every URL and flatten the per-URL document lists into one list.
    docs = [WebBaseLoader(url).load() for url in urls_list]
    docs_list = [item for sublist in docs for item in sublist]
    # print(docs_list)

    # Load and split the bundled PDF as an additional source.
    loader = PyPDFLoader("./doc.pdf")
    pages = loader.load_and_split()

    # Chunk both sources with a token-based splitter.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    doc_init = text_splitter.split_documents(pages)
    doc_splits = text_splitter.split_documents(docs_list)
    doc_new = doc_init + doc_splits
    # doc_new = doc_splits

    # Index the chunks in a Chroma collection using Ollama embeddings.
    vectorstore = Chroma.from_documents(
        documents=doc_new,
        collection_name="rag-chroma",
        embedding=embeddings.ollama.OllamaEmbeddings(model="nomic-embed-text"),
    )
    retriever = vectorstore.as_retriever()

    # after_rag_template = """Answer the question based only on the following context:
    # {context}
    # Question: {question}
    # """
    after_rag_template = """Using the contexts below, answer the question.
    contexts:
    {context}
    Question: {question}
    """
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)

    # The retriever fills {context} from the incoming question; RunnablePassthrough forwards the question itself.
    after_rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | after_rag_prompt
        | model_local
        | StrOutputParser()
    )

    # HuggingFaceHub may echo the prompt, so keep only the text after the question.
    result = after_rag_chain.invoke(question)
    return result.split(f"Question: {question}")[-1].strip()
# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URLs separated by new lines"),
        gr.Textbox(label="Question"),
    ],
    outputs="text",
    title="ChatAcadamy with Mistral",
    description="Enter URLs and a question to query the documents.",
)
iface.launch()
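# A minimal sketch of exercising the pipeline without the Gradio UI, e.g. for local testing.
# It assumes the same environment as the app itself: the 'hf_api' variable set, a reachable
# Ollama server for the nomic-embed-text embeddings, and ./doc.pdf plus
# ./shef_extracted_links.txt present next to the script. The URL and question are examples only.
#
# answer = process_input("https://ollama.com/", "What is Ollama?")
# print(answer)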