search_engine / app.py
bnsapa's picture
Update app.py
b4c3246 verified
raw
history blame
2.89 kB
import streamlit as st
import pickle
import os
import json
from collections import defaultdict
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi
# Constants
# On-disk layout of the prebuilt search artefacts loaded by load_indexes().
BASE_DIR = "built_index"
VECTOR_STORE_DIR = os.path.join(BASE_DIR, "vector_store")  # FAISS dense index directory
BM25_INDEX_FILE = os.path.join(BASE_DIR, "bm25_index.pkl")  # pickled (bm25, bm25_texts, url_order) tuple
SEARCH_INDEX_FILE = os.path.join(BASE_DIR, "search_index.json")  # JSON mapping url -> record with a "content" key
# Load embedding model
@st.cache_resource
def load_embeddings():
    """Create the sentence-transformer embedding model.

    Decorated with st.cache_resource so the model is instantiated only
    once per Streamlit server process and shared across reruns.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return embedder
# Load indexes
@st.cache_resource
def load_indexes():
    """Load all prebuilt search artefacts from disk (cached per process).

    Returns:
        tuple: (index, vector_store, bm25, bm25_texts, url_order) where
            index is a defaultdict mapping url -> page record (each record
            has at least a "content" key, see bm25_search), vector_store
            is the FAISS dense index, and the last three come from the
            pickled sparse-index tuple.
    """
    # Explicit UTF-8: the default text encoding is platform-dependent and
    # would mangle non-ASCII page content on some systems.
    with open(SEARCH_INDEX_FILE, "r", encoding="utf-8") as f:
        index = defaultdict(dict, json.load(f))

    # Dense (semantic) index. allow_dangerous_deserialization is required
    # because the FAISS store is pickled -- acceptable only because these
    # files are produced by our own build step, never by users.
    embeddings = load_embeddings()
    vector_store = FAISS.load_local(
        VECTOR_STORE_DIR, embeddings, allow_dangerous_deserialization=True
    )

    # Sparse (BM25) index. NOTE: pickle.load must only ever see trusted
    # files; this one ships with the app's build output.
    with open(BM25_INDEX_FILE, "rb") as f:
        bm25, bm25_texts, url_order = pickle.load(f)

    return index, vector_store, bm25, bm25_texts, url_order
# Search functions
def semantic_search(vector_store, query, k=5):
    """Dense retrieval: top-k FAISS matches for *query*.

    Each hit is a dict with the document's "url" (falling back to "N/A"
    when the metadata lacks one) and a "snippet" of at most the first
    200 characters of page content.
    """
    hits = []
    for doc in vector_store.similarity_search(query, k=k):
        hits.append({
            "url": doc.metadata.get("url", "N/A"),
            "snippet": doc.page_content[:200],
        })
    return hits
def bm25_search(bm25, bm25_texts, url_order, index, query, k=5):
    """Sparse retrieval: score every document with BM25, keep the top k.

    The query is lower-cased and whitespace-tokenized before scoring.
    Each hit carries the document "url", its BM25 "score", and a
    "snippet" of at most 200 characters from the stored page content.
    (bm25_texts is accepted for interface symmetry but not consulted.)
    """
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    # Pair each document position with its score; sorted() is stable, so
    # ties keep their original document order, highest scores first.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    results = []
    for pos, score in ranked[:k]:
        url = url_order[pos]
        results.append({
            "url": url,
            "score": score,
            "snippet": index[url]["content"][:200],
        })
    return results
# Streamlit UI
def main():
    """Render the search page: header, query box, then dense + sparse results."""
    st.set_page_config(page_title="LangChain Search Engine")
    st.title("LangChain Search Engine ๐Ÿ”")
    st.markdown("Using Dense Search and Sparse Search. Indexed on April 02, 2025")
    st.markdown("for more details visit https://github.com/balnarendrasapa/search-engine")

    query = st.text_input("Enter your search query:", "")
    # Guard clause: nothing to do until the user has typed something.
    if not query:
        return

    # Indexes are loaded lazily on first query; st.cache_resource makes
    # subsequent calls free.
    index, vector_store, bm25, bm25_texts, url_order = load_indexes()
    with st.spinner("Searching..."):
        sem_results = semantic_search(vector_store, query)
        bm25_results = bm25_search(bm25, bm25_texts, url_order, index, query)

    st.subheader("๐Ÿ”Ž Semantic Search Results")
    for rank, hit in enumerate(sem_results, 1):
        st.markdown(f"**{rank}. [{hit['url']}]({hit['url']})**")
        st.write(hit['snippet'] + "...")

    st.subheader("๐Ÿงฎ BM25 Sparse Search Results")
    for rank, hit in enumerate(bm25_results, 1):
        st.markdown(f"**{rank}. [{hit['url']}]({hit['url']})** (Score: {hit['score']:.2f})")
        st.write(hit['snippet'] + "...")


if __name__ == "__main__":
    main()