# Hugging Face Spaces app (the "Spaces: Running" badge text from the captured
# page has been removed; the runnable code starts below).
# Third-party
import dask.dataframe as dd
import faiss
import pandas as pd
import streamlit as st
from gensim.models import Word2Vec
from huggingface_hub import hf_hub_download
def get_dask_df(df_path='bin/data.parquet'):
    """Lazily open the paper metadata table stored as Parquet.

    Parameters
    ----------
    df_path : str
        Location of the Parquet file; defaults to ``'bin/data.parquet'``.

    Returns
    -------
    dask.dataframe.DataFrame
        A lazy frame — nothing is read from disk until it is computed.
    """
    frame = dd.read_parquet(df_path)
    return frame
def query_rows(rows: list):
    """Fetch the metadata records for the given row labels.

    Fixes a defect in the original, which opened the Dask frame, never
    used ``rows``, and implicitly returned ``None``.

    Parameters
    ----------
    rows : list
        Row index labels to look up.
        # NOTE(review): presumably these are FAISS result ids matching the
        # parquet index — confirm against the search pipeline.

    Returns
    -------
    pandas.DataFrame
        The selected rows, materialized from the lazy Dask frame.
    """
    df = get_dask_df()
    # .loc accepts a list of labels; .compute() materializes to pandas.
    return df.loc[rows].compute()
def get_model():
    """Download the Word2Vec model and its numpy side files, then load it.

    gensim stores large arrays in separate ``.npy`` files next to the main
    ``.model`` file, and ``Word2Vec.load()`` expects to find them in the
    same directory.  The two extra downloads below land in the same
    snapshot cache directory as the main file; the original bound their
    return paths to unused locals, which are removed here.

    Returns
    -------
    gensim.models.Word2Vec
        The fully loaded skip-gram model.
    """
    model_path = hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model",
    )
    # Side files required by gensim's lazy array loading — only their
    # presence on disk matters, the returned paths are not needed.
    hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.syn1neg.npy",
    )
    hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.wv.vectors.npy",
    )
    return Word2Vec.load(model_path)
def get_faiss_index():
    """Load the prebuilt FAISS index from disk.

    Returns
    -------
    faiss.Index
        The deserialized index read from ``faiss_index.bin``.
    """
    index_path = "faiss_index.bin"
    return faiss.read_index(index_path)
# --------------------------------------------------------------
# Placeholder: You will plug your search code here.
# Should return a list of paper dicts with:
#   { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Placeholder search hook — returns ``top_k`` dummy paper records.

    Replace the body with the real Word2Vec + FAISS lookup; keep the same
    return shape (a list of dicts with "title", "authors", "abstract",
    and "url" keys).
    """
    sample_paper = {
        "title": "Example Paper Title",
        "authors": "John Doe, Jane Smith",
        "abstract": "This is a sample abstract describing the research paper...",
        "url": "https://arxiv.org/abs/1234.5678",
    }
    # The same dummy record repeated, exactly as the original placeholder did.
    return [sample_paper] * top_k
# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
# NOTE(review): the original UI strings contained mojibake ("π", "βοΈ") from
# a bad encoding round-trip; restored to the most likely intended emoji —
# confirm against the deployed Space.
st.title("🔍 ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar controls
st.sidebar.header("⚙️ Search Options")
top_k = st.sidebar.slider("Top K Results", 5, 50, 10)

# Main search bar
query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. diffusion models for text-to-image, graph neural networks, LLM alignment..."
)
search_button = st.button("Search")
# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
if search_button and query.strip():
    # NOTE(review): spinner text contained mojibake ("π"); restored to emoji.
    with st.spinner("Searching... 🔍"):
        results = run_semantic_search(query, top_k)

    st.subheader(f"Top {top_k} Results")

    # ----------------------------------------------------------
    # Display results (card-style)
    # ----------------------------------------------------------
    for i, paper in enumerate(results, start=1):
        st.markdown(f"### **{i}. {paper['title']}**")
        st.markdown(f"**Authors:** {paper['authors']}")
        st.markdown(f"[🔗 View on arXiv]({paper['url']})")
        with st.expander("Abstract Preview"):
            # Only append an ellipsis when the abstract was actually
            # truncated (the original unconditionally added "...").
            abstract = paper["abstract"]
            st.write(abstract[:600] + "..." if len(abstract) > 600 else abstract)
        st.markdown("---")