Spaces:
Runtime error
Runtime error
Switch vector DB from Chroma to FAISS for HF Spaces compatibility
Browse files
- fast_app.py +9 -12
- ingest.py +5 -7
- requirements.txt +7 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin +0 -0
- stores/czech_512/chroma.sqlite3 +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin +0 -3
- stores/english_512/chroma.sqlite3 +0 -3
fast_app.py
CHANGED
|
@@ -7,20 +7,15 @@ from fastapi.templating import Jinja2Templates
|
|
| 7 |
from fastapi.staticfiles import StaticFiles
|
| 8 |
from fastapi.encoders import jsonable_encoder
|
| 9 |
|
| 10 |
-
from
|
| 11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 12 |
|
| 13 |
from langchain.chains import RetrievalQA
|
| 14 |
-
|
| 15 |
-
TextLoader,
|
| 16 |
-
PyPDFLoader,
|
| 17 |
-
DirectoryLoader,
|
| 18 |
-
UnstructuredFileLoader,
|
| 19 |
-
)
|
| 20 |
-
from langchain.document_loaders.csv_loader import CSVLoader
|
| 21 |
from langchain.llms import OpenAI
|
| 22 |
from langchain import PromptTemplate
|
| 23 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
|
|
|
| 24 |
|
| 25 |
from ingest import Ingest
|
| 26 |
|
|
@@ -31,7 +26,7 @@ from ingest import Ingest
|
|
| 31 |
# if huggingface_token is None:
|
| 32 |
# raise ValueError("Hugging Face token is not set in environment variables.")
|
| 33 |
|
| 34 |
-
openai_api_key =
|
| 35 |
if openai_api_key is None:
|
| 36 |
raise ValueError("OAI token is not set in environment variables.")
|
| 37 |
|
|
@@ -39,8 +34,8 @@ if openai_api_key is None:
|
|
| 39 |
app = FastAPI()
|
| 40 |
templates = Jinja2Templates(directory="templates")
|
| 41 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 42 |
-
english_embedding_model="text-embedding-3-large"
|
| 43 |
-
czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
|
| 44 |
|
| 45 |
czech_store = "stores/czech_512"
|
| 46 |
english_store = "stores/english_512"
|
|
@@ -55,6 +50,7 @@ ingestor = Ingest(
|
|
| 55 |
english_embedding_model=english_embedding_model,
|
| 56 |
)
|
| 57 |
|
|
|
|
| 58 |
def prompt_en():
|
| 59 |
prompt_template_en = """You are electrical engineer and you answer users ###Question.
|
| 60 |
|
|
@@ -75,6 +71,7 @@ def prompt_en():
|
|
| 75 |
print("\n Prompt ready... \n\n")
|
| 76 |
return prompt_en
|
| 77 |
|
|
|
|
| 78 |
def prompt_cz():
|
| 79 |
prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
|
| 80 |
|
|
@@ -144,7 +141,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
|
|
| 144 |
model=embedding_model,
|
| 145 |
)
|
| 146 |
|
| 147 |
-
vectordb =
|
| 148 |
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
|
| 149 |
|
| 150 |
chain_type_kwargs = {"prompt": prompt}
|
|
|
|
| 7 |
from fastapi.staticfiles import StaticFiles
|
| 8 |
from fastapi.encoders import jsonable_encoder
|
| 9 |
|
| 10 |
+
from langchain_community.vectorstores import FAISS
|
| 11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 12 |
|
| 13 |
from langchain.chains import RetrievalQA
|
| 14 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from langchain.llms import OpenAI
|
| 16 |
from langchain import PromptTemplate
|
| 17 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
| 18 |
+
import chainlit as cl
|
| 19 |
|
| 20 |
from ingest import Ingest
|
| 21 |
|
|
|
|
| 26 |
# if huggingface_token is None:
|
| 27 |
# raise ValueError("Hugging Face token is not set in environment variables.")
|
| 28 |
|
| 29 |
+
openai_api_key = os.environ.get("OPENAI_API_KEY")  # never commit API keys; read from the environment
|
| 30 |
if openai_api_key is None:
|
| 31 |
raise ValueError("OAI token is not set in environment variables.")
|
| 32 |
|
|
|
|
| 34 |
app = FastAPI()
|
| 35 |
templates = Jinja2Templates(directory="templates")
|
| 36 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 37 |
+
english_embedding_model = "text-embedding-3-large"
|
| 38 |
+
czech_embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
|
| 39 |
|
| 40 |
czech_store = "stores/czech_512"
|
| 41 |
english_store = "stores/english_512"
|
|
|
|
| 50 |
english_embedding_model=english_embedding_model,
|
| 51 |
)
|
| 52 |
|
| 53 |
+
|
| 54 |
def prompt_en():
|
| 55 |
prompt_template_en = """You are electrical engineer and you answer users ###Question.
|
| 56 |
|
|
|
|
| 71 |
print("\n Prompt ready... \n\n")
|
| 72 |
return prompt_en
|
| 73 |
|
| 74 |
+
|
| 75 |
def prompt_cz():
|
| 76 |
prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
|
| 77 |
|
|
|
|
| 141 |
model=embedding_model,
|
| 142 |
)
|
| 143 |
|
| 144 |
+
vectordb = FAISS.load_local(persist_directory, embedding)
|
| 145 |
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
|
| 146 |
|
| 147 |
chain_type_kwargs = {"prompt": prompt}
|
ingest.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
|
| 4 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
@@ -53,12 +53,11 @@ class Ingest:
|
|
| 53 |
)
|
| 54 |
texts = text_splitter.split_documents(documents)
|
| 55 |
|
| 56 |
-
vectordb =
|
| 57 |
documents=texts,
|
| 58 |
embedding=embedding,
|
| 59 |
-
persist_directory=self.english_store,
|
| 60 |
-
collection_metadata={"hnsw:space": "cosine"},
|
| 61 |
)
|
|
|
|
| 62 |
|
| 63 |
print("\n English vector Store Created.......\n\n")
|
| 64 |
|
|
@@ -84,12 +83,11 @@ class Ingest:
|
|
| 84 |
)
|
| 85 |
|
| 86 |
texts = text_splitter.split_documents(documents)
|
| 87 |
-
vectordb =
|
| 88 |
documents=texts,
|
| 89 |
embedding=embedding,
|
| 90 |
-
persist_directory=self.czech_store,
|
| 91 |
-
collection_metadata={"hnsw:space": "cosine"},
|
| 92 |
)
|
|
|
|
| 93 |
|
| 94 |
print("\n Czech vector Store Created.......\n\n")
|
| 95 |
|
|
|
|
| 1 |
+
from langchain_community.vectorstores import FAISS
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
|
| 4 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
|
|
| 53 |
)
|
| 54 |
texts = text_splitter.split_documents(documents)
|
| 55 |
|
| 56 |
+
vectordb = FAISS.from_documents(
|
| 57 |
documents=texts,
|
| 58 |
embedding=embedding,
|
|
|
|
|
|
|
| 59 |
)
|
| 60 |
+
vectordb.save_local(self.english_store)
|
| 61 |
|
| 62 |
print("\n English vector Store Created.......\n\n")
|
| 63 |
|
|
|
|
| 83 |
)
|
| 84 |
|
| 85 |
texts = text_splitter.split_documents(documents)
|
| 86 |
+
vectordb = FAISS.from_documents(
|
| 87 |
documents=texts,
|
| 88 |
embedding=embedding,
|
|
|
|
|
|
|
| 89 |
)
|
| 90 |
+
vectordb.save_local(self.czech_store)
|
| 91 |
|
| 92 |
print("\n Czech vector Store Created.......\n\n")
|
| 93 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,9 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
| 2 |
fastapi
|
|
|
|
|
|
|
| 3 |
uvicorn
|
| 4 |
python-multipart
|
| 5 |
ctransformers
|
|
@@ -9,8 +13,8 @@ sentence_transformers
|
|
| 9 |
chromadb
|
| 10 |
pytesseract
|
| 11 |
fitz
|
| 12 |
-
libpff-python
|
| 13 |
openai
|
| 14 |
tiktoken
|
| 15 |
frontend
|
| 16 |
-
|
|
|
|
| 1 |
+
|
| 2 |
+
langchain-community==0.0.19
|
| 3 |
+
langchain==0.1.6
|
| 4 |
fastapi
|
| 5 |
+
faiss-cpu
|
| 6 |
+
pypdf
|
| 7 |
uvicorn
|
| 8 |
python-multipart
|
| 9 |
ctransformers
|
|
|
|
| 13 |
chromadb
|
| 14 |
pytesseract
|
| 15 |
fitz
|
| 16 |
+
#libpff-python
|
| 17 |
openai
|
| 18 |
tiktoken
|
| 19 |
frontend
|
| 20 |
+
chainlit
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
|
| 3 |
-
size 1164000
|
|
|
|
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
|
| 3 |
-
size 100
|
|
|
|
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
|
| 3 |
-
size 4000
|
|
|
|
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin
DELETED
|
File without changes
|
stores/czech_512/chroma.sqlite3
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2187862ccdfdb78565366853a939dc50038908171936c8584d69a09b55aa4e7c
|
| 3 |
-
size 1929216
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
|
| 3 |
-
size 12428000
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
|
| 3 |
-
size 100
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
|
| 3 |
-
size 55974
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
|
| 3 |
-
size 4000
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
|
| 3 |
-
size 8624
|
|
|
|
|
|
|
|
|
|
|
|
stores/english_512/chroma.sqlite3
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:369ede691a1330113d353e1a425a7cd24ad9d76ee61ee542adab1f12a6887146
|
| 3 |
-
size 26963968
|
|
|
|
|
|
|
|
|
|
|
|