Prashanth Radhakrishnan committed on
Commit
fc54c76
·
1 Parent(s): f8f9456

Deploy to HF

Browse files
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Dc Statvar Demo
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 3.14.0
8
  app_file: app.py
9
  pinned: false
10
- license: cc-by-sa-4.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Data Commons Variables Search - Demo
3
+ emoji: 🦀
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.14.0
8
  app_file: app.py
9
  pinned: false
10
+ license: cc-by-4.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import pandas as pd
4
+ import torch
5
+ from datasets import load_dataset
6
+ from sentence_transformers.util import semantic_search
7
+ from sentence_transformers import SentenceTransformer, util
8
+
9
+ BUILDS = ['demographics300', 'uncurated3000']
10
+
11
+ # Download model
12
+ model = SentenceTransformer('all-MiniLM-L6-v2')
13
+
14
+ # Load embeddings
15
+ dataset_embeddings_maps = {}
16
+ dcid_maps = {}
17
+ for build in BUILDS:
18
+ print('Loading build ', build)
19
+ ds = load_dataset('csv', data_files=f'embeddings_{build}.csv')
20
+
21
+ df = ds["train"].to_pandas()
22
+ dcid_maps[build] = df['dcid'].values.tolist()
23
+ df = df.drop('dcid', axis=1)
24
+
25
+ dataset_embeddings_maps[build] = torch.from_numpy(df.to_numpy()).to(torch.float)
26
+
27
+
28
+ def inference(build, query):
29
+ query_embeddings = model.encode([query])
30
+
31
+ # Note: multiple results may map to the same DCID.
32
+ hits = semantic_search(query_embeddings, dataset_embeddings_maps[build], top_k=15)
33
+ # As well, the same string may map to multiple DCIDs with the same score.
34
+ sv2score = {}
35
+ score2svs = {}
36
+ for e in hits[0]:
37
+ for d in dcid_maps[build][e['corpus_id']].split(','):
38
+ s = e['score']
39
+ # Prefer the top score.
40
+ if d not in sv2score:
41
+ sv2score[d] = s
42
+ if s not in score2svs:
43
+ score2svs[s] = [d]
44
+ else:
45
+ score2svs[s].append(d)
46
+
47
+ # Sort by scores
48
+ scores = [s for s in sorted(score2svs.keys(), reverse=True)]
49
+ svs = [' : '.join(score2svs[s]) for s in scores]
50
+
51
+ # Add to Pandas
52
+ result = pd.DataFrame({'SV': svs, 'Cosine Score': scores})
53
+ return result
54
+
55
+
56
+ # Create a simple search interface
57
+ title = "DC Search Demo"
58
+ description = """
59
+ Try querying for StatVars.
60
+
61
+ - "demographics300": 300 SVs with curated descriptions (http://shortn/_iJbtpD2uwF)
62
+ related to demographics
63
+ - "uncurated3000": 3000 SVs with only auto-generated name related to
64
+ demographics, crime, agriculture, households, housing, emissions, health
65
+ """
66
+
67
+ # TODO: make logging work
68
+ # HF_TOKEN = os.getenv('HF_TOKEN')
69
+ # hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "dc-statvar-demo-log")
70
+
71
+ iface = gr.Interface(fn=inference,
72
+ inputs=[
73
+ gr.Dropdown(choices=BUILDS,
74
+ value='uncurated3000',
75
+ label='Embeddings Build'),
76
+ gr.Textbox(label='Query',
77
+ placeholder='how long do people live?')
78
+ ],
79
+ outputs=gr.Dataframe(headers=['SV', 'Cosine Score'],
80
+ label='Search Results'),
81
+ title=title,
82
+ description=description,
83
+ allow_flagging="manual",
84
+ flagging_options=["not at all related",
85
+ "related but not ranked right"])
86
+
87
+ iface.launch()
embeddings_demographics300.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ff78405404869bde80d50b2e567314a6c78eed6092e253d8220141a05f4230e
3
+ size 2187591
embeddings_uncurated3000.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c61a3e8986a29ff51d8796fc38c786d0ac85e218db7c8ef2e30ceefdd871e033
3
+ size 16402373
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ datasets
2
+ gradio
3
+ pandas
4
+ sentence-transformers