# Streamlit hybrid search engine: dense (FAISS) + sparse (BM25) retrieval
# over a prebuilt local index.
| import streamlit as st | |
| import pickle | |
| import os | |
| import json | |
| from collections import defaultdict | |
| from langchain.vectorstores import FAISS | |
| from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
| from rank_bm25 import BM25Okapi | |
# Constants
# Artifacts produced by the offline index-build step live under this directory.
BASE_DIR = "built_index"
# FAISS vector store directory (dense retrieval).
VECTOR_STORE_DIR = os.path.join(BASE_DIR, "vector_store")
# Pickled (bm25, bm25_texts, url_order) tuple (sparse retrieval).
BM25_INDEX_FILE = os.path.join(BASE_DIR, "bm25_index.pkl")
# JSON mapping of url -> record with a "content" field, used for snippets.
SEARCH_INDEX_FILE = os.path.join(BASE_DIR, "search_index.json")
# Load embedding model
def load_embeddings():
    """Return the sentence-transformer embedding model, cached per session.

    Streamlit re-executes this script on every user interaction, and the
    HuggingFace model is expensive to construct, so memoize it in
    ``st.session_state`` instead of rebuilding it for each query.
    """
    if "_embeddings" not in st.session_state:
        st.session_state["_embeddings"] = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
    return st.session_state["_embeddings"]
# Load indexes
def load_indexes():
    """Load all search artifacts from disk, cached per session.

    Returns a tuple ``(index, vector_store, bm25, bm25_texts, url_order)``:
    the JSON search index (as a ``defaultdict``), the FAISS vector store,
    the BM25 scorer, its tokenized corpus, and the URL ordering that maps
    BM25 row indices back to URLs.

    Streamlit reruns the script on every interaction; without caching, the
    FAISS store, JSON index and BM25 pickle were re-read on every query.
    """
    if "_indexes" not in st.session_state:
        # Load search index (url -> page record).
        with open(SEARCH_INDEX_FILE, "r") as f:
            index = defaultdict(dict, json.load(f))
        # Load vector store.
        embeddings = load_embeddings()
        # NOTE(review): allow_dangerous_deserialization and pickle.load are
        # only acceptable because these files are produced locally by our
        # own trusted index-build step — never point them at untrusted data.
        vector_store = FAISS.load_local(
            VECTOR_STORE_DIR, embeddings, allow_dangerous_deserialization=True
        )
        # Load BM25 index.
        with open(BM25_INDEX_FILE, "rb") as f:
            bm25, bm25_texts, url_order = pickle.load(f)
        st.session_state["_indexes"] = (
            index, vector_store, bm25, bm25_texts, url_order
        )
    return st.session_state["_indexes"]
# Search functions
def semantic_search(vector_store, query, k=5):
    """Dense retrieval: top-*k* FAISS matches for *query*.

    Each hit is a dict with the document's source "url" ("N/A" when the
    metadata carries none) and a 200-character "snippet" of its content.
    """
    hits = []
    for doc in vector_store.similarity_search(query, k=k):
        hits.append({
            "url": doc.metadata.get("url", "N/A"),
            "snippet": doc.page_content[:200],
        })
    return hits
def bm25_search(bm25, bm25_texts, url_order, index, query, k=5):
    """Sparse retrieval: score every indexed page against *query* with BM25.

    Tokenization is a plain lowercase whitespace split. Returns the *k*
    highest-scoring pages as dicts carrying the page "url", its BM25
    "score", and a 200-character "snippet" from the search index.
    (``bm25_texts`` is accepted for signature compatibility but unused.)
    """
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    results = []
    for i in ranked[:k]:
        url = url_order[i]
        results.append({
            "url": url,
            "score": scores[i],
            "snippet": index[url]["content"][:200],
        })
    return results
# Streamlit UI
def main():
    """Render the search UI: one query box, dense + sparse result lists."""
    st.set_page_config(page_title="LangChain Search Engine")
    # Emoji below were mojibake in the original ("๐", "๐งฎ") — repaired to
    # the UTF-8 characters those byte sequences decode to (🔍 / 🧮).
    st.title("LangChain Search Engine 🔍")
    st.markdown("Using Dense Search and Sparse Search. Indexed on April 02, 2025")
    st.markdown("for more details visit https://github.com/balnarendrasapa/search-engine")
    query = st.text_input("Enter your search query:", "")
    if not query:
        return
    with st.spinner("Searching..."):
        # Load indexes inside the spinner so the user gets feedback during
        # the potentially slow first load (originally loaded outside it).
        index, vector_store, bm25, bm25_texts, url_order = load_indexes()
        sem_results = semantic_search(vector_store, query)
        bm25_results = bm25_search(bm25, bm25_texts, url_order, index, query)
    st.subheader("🔍 Semantic Search Results")
    for rank, res in enumerate(sem_results, 1):
        st.markdown(f"**{rank}. [{res['url']}]({res['url']})**")
        st.write(res['snippet'] + "...")
    st.subheader("🧮 BM25 Sparse Search Results")
    for rank, res in enumerate(bm25_results, 1):
        st.markdown(f"**{rank}. [{res['url']}]({res['url']})** (Score: {res['score']:.2f})")
        st.write(res['snippet'] + "...")

if __name__ == "__main__":
    main()