luciagomez committed on
Commit
110cabc
·
verified ·
1 Parent(s): 98c4bb5

delete rag.py

Browse files
Files changed (1) hide show
  1. rag.py +0 -47
rag.py DELETED
@@ -1,47 +0,0 @@
1
- from pathlib import Path
2
- from typing import List, Dict, Any, Optional
3
- from langchain_community.document_loaders import PyPDFLoader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_community.embeddings import HuggingFaceEmbeddings
7
-
8
- INDEX_DIR = Path("data/vectorstore/faiss_index")
9
- INDEX_DIR.mkdir(parents=True, exist_ok=True)
10
-
11
- # Small + strong enough CPU embedding
12
- EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
13
-
14
- def load_embeddings():
15
- return HuggingFaceEmbeddings(model_name=EMB_MODEL)
16
-
17
- def split_pdf(file_path: str):
18
- loader = PyPDFLoader(file_path)
19
- pages = loader.load()
20
- splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
21
- return splitter.split_documents(pages)
22
-
23
- def _faiss_paths():
24
- return str(INDEX_DIR / "index.faiss"), str(INDEX_DIR / "index.pkl")
25
-
26
- def load_or_create_faiss(emb):
27
- faiss_path, pkl_path = _faiss_paths()
28
- if Path(faiss_path).exists() and Path(pkl_path).exists():
29
- return FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)
30
- # empty new index
31
- return FAISS.from_texts([""], emb).delete(["0"]) or FAISS(embeddings=emb, index=None, docstore=None, index_to_docstore_id=None)
32
-
33
- def add_pdf_to_index(file_path: str, metadata: Optional[Dict[str, Any]] = None):
34
- emb = load_embeddings()
35
- vectordb = load_or_create_faiss(emb)
36
- splits = split_pdf(file_path)
37
- # attach metadata to each chunk
38
- md = metadata or {}
39
- for d in splits:
40
- d.metadata.update(md)
41
- vectordb.add_documents(splits)
42
- vectordb.save_local(INDEX_DIR)
43
-
44
- def get_retriever(k: int = 4):
45
- emb = load_embeddings()
46
- vectordb = load_or_create_faiss(emb)
47
- return vectordb.as_retriever(search_kwargs={"k": k})