Spaces:
Sleeping
Sleeping
Initial commit for the whole codebase
Browse files- .gitattributes +1 -0
- Langchain Document Helper Architecture.png +3 -0
- Pipfile +26 -0
- Pipfile.lock +0 -0
- app.py +85 -0
- backend/core.py +34 -0
- consts.py +1 -0
- ingestion.py +42 -0
- requirements.txt +110 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Langchain[[:space:]]Document[[:space:]]Helper[[:space:]]Architecture.png filter=lfs diff=lfs merge=lfs -text
|
Langchain Document Helper Architecture.png
ADDED
|
Git LFS Details
|
Pipfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[[source]]
|
| 2 |
+
url = "https://pypi.org/simple"
|
| 3 |
+
verify_ssl = true
|
| 4 |
+
name = "pypi"
|
| 5 |
+
|
| 6 |
+
[packages]
|
| 7 |
+
langchain = "*"
|
| 8 |
+
beautifulsoup4 = "*"
|
| 9 |
+
black = "*"
|
| 10 |
+
tiktoken = "*"
|
| 11 |
+
openai = "*"
|
| 12 |
+
pinecone-client = "*"
|
| 13 |
+
unstructured = "*"
|
| 14 |
+
nltk = "*"
|
| 15 |
+
fastapi = "*"
|
| 16 |
+
jinja2 = "*"
|
| 17 |
+
uvicorn = "*"
|
| 18 |
+
streamlit = "*"
|
| 19 |
+
streamlit-chat = "*"
|
| 20 |
+
tqdm = "*"
|
| 21 |
+
|
| 22 |
+
[dev-packages]
|
| 23 |
+
|
| 24 |
+
[requires]
|
| 25 |
+
python_version = "3.11"
|
| 26 |
+
python_full_version = "3.11.0"
|
Pipfile.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Set

from backend.core import run_llm
import streamlit as st
from streamlit_chat import message
from PIL import Image
from io import BytesIO
import base64

# (disabled) background helper, kept for reference:
# def add_bg_from_local(image_file):
#     with open(image_file, "rb") as image_file:
#         encoded_string = base64.b64encode(image_file.read())
#         st.markdown(
#             f"""
#             <style>
#             .stApp {{
#                 background-image: url(data:{"jpeg"};base64,{encoded_string.decode()});
#                 background-size: cover
#             }}
#             </style>
#             """,
#             unsafe_allow_html=True
#         )
# background_image = "bg2.jpeg"
# add_bg_from_local(background_image)


st.header("LangChain 🦜🔗 Documentation - Helper ChatBot")

# Initialise the per-session conversation buffers exactly once per session.
for _buffer in ("user_prompt_history", "chat_answers_history", "chat_history"):
    if _buffer not in st.session_state:
        st.session_state[_buffer] = []
def create_sources_string(source_urls: Set[str]) -> str:
    """Render a sorted, numbered ``sources:`` list from a set of URLs.

    Returns an empty string when no sources were retrieved, so the caller
    can append the result unconditionally.
    """
    if not source_urls:
        return ""
    numbered = (
        f"{idx}. {url}\n"
        for idx, url in enumerate(sorted(source_urls), start=1)
    )
    return "sources:\n" + "".join(numbered)
prompt = st.text_input("Prompt", placeholder="Enter your message here...")


if prompt:
    with st.spinner("Generating response..."):
        generated_response = run_llm(
            query=prompt, chat_history=st.session_state["chat_history"]
        )
        # Unique source URLs of the documents the retriever used for this answer.
        sources = set(
            doc.metadata["source"] for doc in generated_response["source_documents"]
        )
        formatted_response = (
            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
        )

        # Consistent dict-style access (the init code above uses the same style).
        st.session_state["user_prompt_history"].append(prompt)
        st.session_state["chat_answers_history"].append(formatted_response)
        st.session_state["chat_history"].append((prompt, generated_response["answer"]))

if st.session_state["chat_answers_history"]:
    # Replay the whole conversation, oldest first.
    for i, (generated_response, user_query) in enumerate(
        zip(
            st.session_state["chat_answers_history"],
            st.session_state["user_prompt_history"],
        )
    ):
        message(
            user_query,
            is_user=True,
            avatar_style="adventurer",
            seed=123,
            # A unique key per rendered message avoids Streamlit's
            # DuplicateWidgetID error once the history has >1 entry.
            key=f"user_msg_{i}",
        )
        # message(generated_response)
        st.write(
            f'<div style="word-wrap: break-word;">{generated_response}</div>',
            unsafe_allow_html=True,
        )
backend/core.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from typing import Any, Dict, List

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.vectorstores import Pinecone
import pinecone

from dotenv import load_dotenv

from consts import INDEX_NAME

# Load PINECONE_* / OPENAI_* credentials from a local .env before any client init.
load_dotenv()

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
def run_llm(query: str, chat_history: List[Any]) -> Dict[str, Any]:
    """Answer *query* with a conversational retrieval chain over the Pinecone index.

    Args:
        query: The user's question.
        chat_history: Prior (question, answer) tuples giving conversational context.

    Returns:
        The chain's output dict — notably ``'answer'`` and ``'source_documents'``.
    """
    embeddings = OpenAIEmbeddings()
    doc_search = Pinecone.from_existing_index(
        index_name=INDEX_NAME, embedding=embeddings
    )
    chat = ChatOpenAI(verbose=True, temperature=0)

    # Single-shot variant, kept for reference:
    # qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=doc_search.as_retriever(), return_source_documents=True)

    qa = ConversationalRetrievalChain.from_llm(
        llm=chat, retriever=doc_search.as_retriever(), return_source_documents=True
    )

    return qa({"question": query, "chat_history": chat_history})

# if __name__ == "__main__":
#     print(run_llm(query = "What is RetrievalQA Chain ? "), )
consts.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Name of the Pinecone index holding the embedded LangChain docs;
# written by ingestion.py and read by backend/core.py.
INDEX_NAME = "langchain-doc-index"
|
ingestion.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from dotenv import load_dotenv

from consts import INDEX_NAME

# Credentials come from a local .env file (PINECONE_*, OPENAI_*).
load_dotenv()

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
def ingest_docs() -> None:
    """Load the scraped LangChain ReadTheDocs dump, split it, and index it in Pinecone."""
    # openai_api_key=os.environ.get("OPENAI_API_KEY")
    loader = ReadTheDocsLoader(path="langchain-docs/langchain.readthedocs.io/en/latest")
    raw_documents = loader.load()
    # Fixed message: was "loaded {len(raw_documents) }documents" (missing space).
    print(f"Loaded {len(raw_documents)} documents")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split into {len(documents)} chunks")

    # Map the local dump path back to the canonical https:// doc URL so the
    # chat UI can cite clickable sources.
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs", "https:/")
        doc.metadata.update({"source": new_url})

    print(f"Going to insert {len(documents)} to Pinecone")
    embeddings = OpenAIEmbeddings()
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")


if __name__ == "__main__":
    ingest_docs()
|
requirements.txt
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiohttp==3.9.1
|
| 2 |
+
aiosignal==1.3.1
|
| 3 |
+
altair==5.2.0
|
| 4 |
+
annotated-types==0.6.0
|
| 5 |
+
anyio==4.2.0
|
| 6 |
+
async-timeout==4.0.3
|
| 7 |
+
attrs==23.2.0
|
| 8 |
+
backoff==2.2.1
|
| 9 |
+
beautifulsoup4==4.12.2
|
| 10 |
+
black==23.12.1
|
| 11 |
+
blinker==1.7.0
|
| 12 |
+
cachetools==5.3.2
|
| 13 |
+
certifi==2023.11.17
|
| 14 |
+
chardet==5.2.0
|
| 15 |
+
charset-normalizer==3.3.2
|
| 16 |
+
click==8.1.7
|
| 17 |
+
dataclasses-json==0.6.3
|
| 18 |
+
distro==1.9.0
|
| 19 |
+
dnspython==2.4.2
|
| 20 |
+
emoji==2.9.0
|
| 21 |
+
exceptiongroup==1.2.0
|
| 22 |
+
fastapi==0.108.0
|
| 23 |
+
filetype==1.2.0
|
| 24 |
+
frozenlist==1.4.1
|
| 25 |
+
gitdb==4.0.11
|
| 26 |
+
GitPython==3.1.40
|
| 27 |
+
greenlet==3.0.3
|
| 28 |
+
h11==0.14.0
|
| 29 |
+
httpcore==1.0.2
|
| 30 |
+
httpx==0.26.0
|
| 31 |
+
idna==3.6
|
| 32 |
+
importlib-metadata==6.11.0
|
| 33 |
+
install==1.3.5
|
| 34 |
+
Jinja2==3.1.2
|
| 35 |
+
joblib==1.3.2
|
| 36 |
+
jsonpatch==1.33
|
| 37 |
+
jsonpath-python==1.0.6
|
| 38 |
+
jsonpointer==2.4
|
| 39 |
+
jsonschema==4.20.0
|
| 40 |
+
jsonschema-specifications==2023.12.1
|
| 41 |
+
langchain==0.0.354
|
| 42 |
+
langchain-community==0.0.8
|
| 43 |
+
langchain-core==0.1.5
|
| 44 |
+
langdetect==1.0.9
|
| 45 |
+
langsmith==0.0.77
|
| 46 |
+
loguru==0.7.2
|
| 47 |
+
lxml==5.0.0
|
| 48 |
+
markdown-it-py==3.0.0
|
| 49 |
+
MarkupSafe==2.1.3
|
| 50 |
+
marshmallow==3.20.1
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
multidict==6.0.4
|
| 53 |
+
mypy-extensions==1.0.0
|
| 54 |
+
nltk==3.8.1
|
| 55 |
+
numpy==1.26.3
|
| 56 |
+
openai==1.6.1
|
| 57 |
+
packaging==23.2
|
| 58 |
+
pandas==2.1.4
|
| 59 |
+
pathspec==0.12.1
|
| 60 |
+
pillow==10.2.0
|
| 61 |
+
pinecone-client==2.2.4
|
| 62 |
+
platformdirs==4.1.0
|
| 63 |
+
protobuf==4.25.1
|
| 64 |
+
pyarrow==14.0.2
|
| 65 |
+
pydantic==2.5.3
|
| 66 |
+
pydantic_core==2.14.6
|
| 67 |
+
pydeck==0.8.1b0
|
| 68 |
+
Pygments==2.17.2
|
| 69 |
+
PyMuPDF==1.23.8
|
| 70 |
+
PyMuPDFb==1.23.7
|
| 71 |
+
python-dateutil==2.8.2
|
| 72 |
+
python-dotenv==1.0.0
|
| 73 |
+
python-iso639==2024.1.2
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
pytz==2023.3.post1
|
| 76 |
+
PyYAML==6.0.1
|
| 77 |
+
rapidfuzz==3.6.1
|
| 78 |
+
referencing==0.32.0
|
| 79 |
+
regex==2023.12.25
|
| 80 |
+
requests==2.31.0
|
| 81 |
+
rich==13.7.0
|
| 82 |
+
rpds-py==0.16.2
|
| 83 |
+
six==1.16.0
|
| 84 |
+
smmap==5.0.1
|
| 85 |
+
sniffio==1.3.0
|
| 86 |
+
soupsieve==2.5
|
| 87 |
+
SQLAlchemy==2.0.25
|
| 88 |
+
starlette==0.32.0.post1
|
| 89 |
+
streamlit==1.29.0
|
| 90 |
+
streamlit-chat==0.1.1
|
| 91 |
+
tabulate==0.9.0
|
| 92 |
+
tenacity==8.2.3
|
| 93 |
+
tiktoken==0.5.2
|
| 94 |
+
toml==0.10.2
|
| 95 |
+
tomli==2.0.1
|
| 96 |
+
toolz==0.12.0
|
| 97 |
+
tornado==6.4
|
| 98 |
+
tqdm==4.66.1
|
| 99 |
+
typing-inspect==0.9.0
|
| 100 |
+
typing_extensions==4.9.0
|
| 101 |
+
tzdata==2023.4
|
| 102 |
+
tzlocal==5.2
|
| 103 |
+
unstructured==0.11.7
|
| 104 |
+
unstructured-client==0.15.1
|
| 105 |
+
urllib3==2.1.0
|
| 106 |
+
uvicorn==0.25.0
|
| 107 |
+
validators==0.22.0
|
| 108 |
+
wrapt==1.16.0
|
| 109 |
+
yarl==1.9.4
|
| 110 |
+
zipp==3.17.0
|