# Streamlit Space: semantic search over NCEI dataset metadata using sentence transformers.
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import time
def find_abstracts(soup):
    """Extract identifiers, titles, and abstracts from a CSW XML response.

    Args:
        soup: Parsed document (BeautifulSoup) containing <csw:record> elements.

    Returns:
        Tuple of three parallel lists: (identifiers, titles, abstracts).
        Records with no <dct:abstract> element get the placeholder "NA" so
        the three lists stay aligned.
    """
    id_list = []
    title_list = []
    abs_list = []
    for record in soup.find_all("csw:record"):
        # Renamed from `id`/`abs` to avoid shadowing the builtins.
        identifier = record.find("dc:identifier")
        abstract = record.find("dct:abstract")
        title = record.find("dc:title")
        id_list.append(identifier.text)
        title_list.append(title.text)
        # Not every record carries an abstract; keep lists the same length.
        abs_list.append(abstract.text if abstract is not None else "NA")
    return id_list, title_list, abs_list
def get_metadata():
    """Fetch dataset metadata from NOAA NCEI's Geoportal CSW endpoint.

    Downloads up to 5000 records, extracts identifier/title/abstract from
    the XML, caches the table to ./ncei-metadata.csv, and returns it.

    Returns:
        pandas.DataFrame with columns ["identifier", "title", "abstract"].

    Raises:
        requests.HTTPError: if the Geoportal request fails.
    """
    URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"
    # Timeout so a hung endpoint can't block the app forever; fail fast on
    # HTTP errors instead of silently parsing an error page as XML.
    page = requests.get(URL, timeout=60)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)
    df = pd.DataFrame(
        list(zip(id_list, title_list, abs_list)),
        columns=["identifier", "title", "abstract"],
    )
    df.to_csv("./ncei-metadata.csv")
    return df
def show_model(query):
    """Rank archived dataset abstracts against a natural-language query.

    Loads the metadata table (refreshing the on-disk cache if it is missing
    or older than one day), embeds the query and every abstract with a
    sentence-transformer model, and scores them by dot product.

    Args:
        query: Free-text search string supplied by the caller (e.g. from a
            Streamlit text input).

    Returns:
        List of (abstract, score, title) tuples sorted by decreasing score.
    """
    path = "./ncei-metadata.csv"
    DAY = 86400  # seconds in one day
    # Use the cached CSV only if it exists and is fresh; otherwise re-fetch.
    if os.path.exists(path) and time.time() - os.path.getmtime(path) <= DAY:
        df = pd.read_csv(path)
    else:
        df = get_metadata()
    # Only records that actually have an abstract form the document corpus.
    docs_df = df[df["abstract"] != "NA"]
    docs = list(docs_df["abstract"])
    titles = list(docs_df["title"])
    # BUG FIX: previously `query = input("Enter your query: ")` clobbered the
    # parameter here, which blocks (and fails) when run under Streamlit.
    # The caller-supplied query is now used directly.
    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
    # Encode query and documents, then score every document by dot product.
    query_emb = model.encode(query)
    doc_emb = model.encode(docs)
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
    # Pair each abstract with its score and title, best match first.
    doc_score_pairs = sorted(zip(docs, scores, titles), key=lambda x: x[1], reverse=True)
    return doc_score_pairs
def main():
    """Render the Streamlit presentation page for the semantic-search demo."""
    st.title("Semantic Search for Datasets Using Sentence Transformers")
    st.write("A case study for the National Centers for Environmental Information (NCEI)")
    st.image("noaa_logo.png", width=150)
    st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
    st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
    st.image("pres-whatisnoaa.png")
    st.write("## The Problem Context")
    st.write("Uses service called OneStop for data search")
    st.write("**Problems:**")
    st.write("- Uses keyword search -- not robust to natural language queries")
    st.write("- Filtering options too specific for non-expert users")
    st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
    st.image("pres-sentencetransformers.png")
    st.write("## Project Data")
    st.image("pres-metadata.png")
    st.write("## The Process")
    st.image("pres-creatingse.png")
    st.write("## Results and Demo")
    st.write("[Demo Notebook](https://github.com/myrandaGoesToSpace/semantic-search-datasets/blob/main/semantic_search.ipynb)")
    st.image("pres-futureplans.png")
    st.write("## Critical Analysis")
    st.write("- did not run with Streamlit text input")
    st.write("- only embeds the first 5000 datasets")
    st.write("- calculates embeddings for datasets with each run")


# Guarded entry point: Streamlit executes the script with __name__ set to
# "__main__", so the page still renders, but importing this module elsewhere
# no longer triggers the UI as a side effect.
if __name__ == "__main__":
    main()