|
|
import os |
|
|
import sys |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
|
|
|
# Put the parent of this file's directory on the import path so that the
# ``rag`` package resolves when this file is executed directly as a script.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
)

from rag.logger import get_logger  # noqa: E402  (must follow the sys.path tweak)

# Module-level logger shared by every function in this file.
logger = get_logger(__name__)
|
|
|
|
|
def load_vectorstore():
    """Load the persisted FAISS index from ``<base_dir>/data/vectorstores``.

    ``base_dir`` is the parent of the directory that contains this file.
    The index location is validated *before* the embedding model is
    instantiated, so a missing index fails immediately instead of first
    paying for model initialisation.

    Returns:
        The vector store object returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If the vector store directory does not exist.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    vectorstore_path = os.path.join(base_dir, "data", "vectorstores")

    # Fail fast: ``load_local`` needs a folder, so a missing path (or a plain
    # file sitting at that path) is reported before any heavy work is done.
    if not os.path.isdir(vectorstore_path):
        raise FileNotFoundError(
            f"Vectorstore not found at: {vectorstore_path}\nRun ingest.py first."
        )

    # NOTE(review): presumably this must be the same model that ingest.py used
    # to build the index -- confirm against the ingestion script.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # SECURITY: allow_dangerous_deserialization=True makes load_local unpickle
    # data stored alongside the index. Unpickling can execute arbitrary code,
    # so only load indexes produced by a trusted ingest run.
    vectorstore = FAISS.load_local(
        vectorstore_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    logger.info("Vector store loaded")
    return vectorstore
|
|
|
|
|
def get_retriever(top_k: int = 5):
    """Create a retriever backed by the persisted vector store.

    Args:
        top_k: Value passed to the retriever as the ``"k"`` search option.

    Returns:
        The object produced by ``vectorstore.as_retriever``.
    """
    store = load_vectorstore()
    search_options = {"k": top_k}
    retriever = store.as_retriever(search_kwargs=search_options)

    # NOTE(review): this message is emitted when the retriever is built; no
    # query has been executed at this point.
    logger.info("Retrieval Complete")
    return retriever
|
|
|
|
|
|
|
|
def search(query: str, top_k: int = 5):
    """Run a similarity search for ``query`` against the persisted index.

    The vector store is loaded afresh on every call.

    Args:
        query: Text to search for.
        top_k: Forwarded to ``similarity_search`` as ``k``.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the query.
    """
    return load_vectorstore().similarity_search(query, k=top_k)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run one fixed query and dump every returned chunk.
    sample_query = "GITHUB REPO DATA"
    matches = search(sample_query, top_k=5)

    print("\n=== SIMILARITY RESULTS ===")
    for doc in matches:
        print("\n--- CHUNK ---")
        print(doc.page_content)
        print("Metadata:", doc.metadata)
|
|
|