Spaces:

datacommonsorg
/

dc_statvar_demo

Runtime error

App Files Files Community

Prashanth Radhakrishnan committed on Dec 20, 2022

Commit

fc54c76

1 Parent(s): f8f9456

Deploy to HF

Browse files

Files changed (5) hide show

README.md +5 -5
app.py +87 -0
embeddings_demographics300.csv +3 -0
embeddings_uncurated3000.csv +3 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
 ---
-title: Dc Statvar Demo
-emoji: 🔥
-colorFrom: pink
-colorTo: green
 sdk: gradio
 sdk_version: 3.14.0
 app_file: app.py
 pinned: false
-license: cc-by-sa-4.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Data Commons Variables Search - Demo
+emoji: 🦀
+colorFrom: blue
+colorTo: red
 sdk: gradio
 sdk_version: 3.14.0
 app_file: app.py
 pinned: false
+license: cc-by-4.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import gradio as gr
+import os
+import pandas as pd
+import torch
+from datasets import load_dataset
+from sentence_transformers.util import semantic_search
+from sentence_transformers import SentenceTransformer, util
+BUILDS = ['demographics300', 'uncurated3000']
+# Download/load the sentence-embedding model used to encode queries
+model = SentenceTransformer('all-MiniLM-L6-v2')
+# Load each build's embeddings CSV: the 'dcid' column is kept as a list, the remaining columns become a float tensor
+dataset_embeddings_maps = {}
+dcid_maps = {}
+for build in BUILDS:
+  print('Loading build ', build)
+  ds = load_dataset('csv', data_files=f'embeddings_{build}.csv')
+  df = ds["train"].to_pandas()
+  dcid_maps[build] = df['dcid'].values.tolist()
+  df = df.drop('dcid', axis=1)
+  dataset_embeddings_maps[build] = torch.from_numpy(df.to_numpy()).to(torch.float)
+def inference(build, query):
+  query_embeddings = model.encode([query])
+  # Note: multiple results may map to the same DCID. As well, the same string may
+  # map to multiple DCIDs with the same score.
+  hits = semantic_search(query_embeddings, dataset_embeddings_maps[build], top_k=15)
+  sv2score = {}
+  score2svs = {}
+  for e in hits[0]:
+    for d in dcid_maps[build][e['corpus_id']].split(','):
+      s = e['score']
+      # Prefer the top score: only the first score seen for a DCID is kept (assumes hits are sorted best-first).
+      if d not in sv2score:
+        sv2score[d] = s
+        if s not in score2svs:
+          score2svs[s] = [d]
+        else:
+          score2svs[s].append(d)
+  # Sort by scores, highest first; DCIDs sharing a score are joined into one row
+  scores = [s for s in sorted(score2svs.keys(), reverse=True)]
+  svs = [' : '.join(score2svs[s]) for s in scores]
+  # Add to Pandas
+  result = pd.DataFrame({'SV': svs, 'Cosine Score': scores})
+  return result
+# Create a simple search interface: pick an embeddings build, enter a query, get a table of results
+title = "DC Search Demo"
+description = """
+Try querying for StatVars.
+- "demographics300": 300 SVs with curated descriptions (http://shortn/_iJbtpD2uwF)
+  related to demographics
+- "uncurated3000": 3000 SVs with only auto-generated name related to
+  demographics, crime, agriculture, households, housing, emissions, health
+"""
+# TODO: make logging work
+# HF_TOKEN = os.getenv('HF_TOKEN')
+# hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "dc-statvar-demo-log")
+iface = gr.Interface(fn=inference,
+                     inputs=[
+                         gr.Dropdown(choices=BUILDS,
+                                     value='uncurated3000',
+                                     label='Embeddings Build'),
+                         gr.Textbox(label='Query',
+                                    placeholder='how long do people live?')
+                     ],
+                     outputs=gr.Dataframe(headers=['SV', 'Cosine Score'],
+                                          label='Search Results'),
+                     title=title,
+                     description=description,
+                     allow_flagging="manual",
+                     flagging_options=["not at all related",
+                                       "related but not ranked right"])
+iface.launch()

embeddings_demographics300.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ff78405404869bde80d50b2e567314a6c78eed6092e253d8220141a05f4230e
+size 2187591

embeddings_uncurated3000.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c61a3e8986a29ff51d8796fc38c786d0ac85e218db7c8ef2e30ceefdd871e033
+size 16402373

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+datasets
+gradio
+pandas
+sentence-transformers