Spaces:
Paused
Paused
delete rag.py
Browse files
rag.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
-
from typing import List, Dict, Any, Optional
|
| 3 |
-
from langchain_community.document_loaders import PyPDFLoader
|
| 4 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
-
from langchain_community.vectorstores import FAISS
|
| 6 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 7 |
-
|
| 8 |
-
# On-disk location of the persisted FAISS index (index.faiss / index.pkl).
INDEX_DIR = Path("data/vectorstore/faiss_index")
# Ensure the directory exists at import time so save_local never fails
# on a fresh checkout.
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Small + strong enough CPU embedding (sentence-transformers model name
# passed to HuggingFaceEmbeddings).
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 13 |
-
|
| 14 |
-
def load_embeddings():
    """Build the HuggingFace embedding model shared by all index helpers."""
    model = HuggingFaceEmbeddings(model_name=EMB_MODEL)
    return model
|
| 16 |
-
|
| 17 |
-
def split_pdf(file_path: str):
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        List of langchain Documents (chunk_size=1024, chunk_overlap=64).
    """
    pages = PyPDFLoader(file_path).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    return chunker.split_documents(pages)
|
| 22 |
-
|
| 23 |
-
def _faiss_paths():
    """Return the (index.faiss, index.pkl) file paths under INDEX_DIR as strings."""
    index_file = INDEX_DIR / "index.faiss"
    meta_file = INDEX_DIR / "index.pkl"
    return str(index_file), str(meta_file)
|
| 25 |
-
|
| 26 |
-
def load_or_create_faiss(emb):
    """Load the persisted FAISS index from INDEX_DIR, or create an empty one.

    Args:
        emb: Embedding model (as returned by ``load_embeddings``).

    Returns:
        A FAISS vector store ready for ``add_documents`` / ``as_retriever``.
    """
    faiss_path, pkl_path = _faiss_paths()
    if Path(faiss_path).exists() and Path(pkl_path).exists():
        # allow_dangerous_deserialization: the pickle was written by this
        # module via save_local, so it is trusted local data.
        return FAISS.load_local(
            str(INDEX_DIR), emb, allow_dangerous_deserialization=True
        )
    # No persisted index yet: bootstrap an empty store. FAISS exposes no
    # public "create empty" helper here, so seed it with one placeholder
    # text and delete it by its *actual* docstore id. (The original code
    # called .delete(["0"]), but docstore ids are generated UUIDs, so that
    # delete failed and fell through to a FAISS(...) call with an invalid
    # keyword and None index/docstore that would crash on first use.)
    store = FAISS.from_texts(["placeholder"], emb)
    store.delete(list(store.index_to_docstore_id.values()))
    return store
|
| 32 |
-
|
| 33 |
-
def add_pdf_to_index(file_path: str, metadata: Optional[Dict[str, Any]] = None):
    """Chunk a PDF, tag every chunk with metadata, and persist to the index.

    Args:
        file_path: Path to the PDF to ingest.
        metadata: Optional dict merged into each chunk's metadata.
    """
    embeddings = load_embeddings()
    store = load_or_create_faiss(embeddings)
    chunks = split_pdf(file_path)
    # Stamp every chunk so retrieval results can be traced to their source.
    extra = {} if metadata is None else metadata
    for chunk in chunks:
        chunk.metadata.update(extra)
    store.add_documents(chunks)
    store.save_local(INDEX_DIR)
|
| 43 |
-
|
| 44 |
-
def get_retriever(k: int = 4):
    """Return a retriever over the persisted index.

    Args:
        k: Number of chunks to fetch per query (default 4).
    """
    vector_store = load_or_create_faiss(load_embeddings())
    return vector_store.as_retriever(search_kwargs={"k": k})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|