NealCaren commited on
Commit
35d8ab8
·
0 Parent(s):

Duplicate from NealCaren/paragraphs

Browse files
Files changed (5) hide show
  1. .gitattributes +36 -0
  2. README.md +13 -0
  3. app.py +189 -0
  4. emerac.png +0 -0
  5. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ passages_0.jsonl filter=lfs diff=lfs merge=lfs -text
33
+ passages_4.jsonl filter=lfs diff=lfs merge=lfs -text
34
+ passages_1.jsonl filter=lfs diff=lfs merge=lfs -text
35
+ passages_2.jsonl filter=lfs diff=lfs merge=lfs -text
36
+ passages_3.jsonl filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Paragraphs
3
+ emoji: 😻
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: NealCaren/paragraphs
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: semantic search ("EMERAC") over sociology journal paragraphs.

Retrieves candidate paragraphs with a bi-encoder, re-ranks them with a
cross-encoder, and renders results grouped by article.
"""

import io
import pickle
import re
from collections import OrderedDict

import gdown
import nltk
import numpy as np
import pandas as pd  # hoisted from mid-module so all imports live together
import requests
import streamlit as st
import torch
from nltk.tokenize import sent_tokenize
from PIL import Image
from sentence_transformers import CrossEncoder, SentenceTransformer, util

# CSS tweak: shrink the expander-header font used for the result sentences.
st.markdown(
    """
    <style>
    .streamlit-expanderHeader {
        font-size: medium;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Sentence-boundary models required by nltk.sent_tokenize below.
nltk.download('punkt')

# Encode on GPU when one is available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Google Drive URL of the passages file, supplied via Streamlit secrets.
purl = st.secrets["graphs_url"]
print(purl)
@st.cache
def load_embeddings():
    """Download the precomputed passage embeddings and return them as an array.

    Cached by Streamlit, so the Google Drive download happens only once
    per server process.
    """
    drive_url = "https://drive.google.com/uc?export=download&id=1z9eoBI07p_YtrdK1ZWZeCRT5T5mu5nhV"
    local_path = "embeddings.npy"
    gdown.download(drive_url, local_path, quiet=False)
    return np.load(local_path)

@st.cache
def load_data(url):
    """Download the passages JSONL from *url* and return it as a DataFrame.

    Cached by Streamlit; the index is reset so rows line up with the
    embedding matrix row order.
    """
    local_path = "passages.jsonl"
    gdown.download(url, local_path, quiet=False)

    frame = pd.read_json(local_path, lines=True)
    frame.reset_index(inplace=True, drop=True)
    return frame


st.title('Sociology EMERAC')

st.write('This project is a work-in-progress that searches the text of recently-published articles from a few sociology journals and retrieves the most relevant paragraphs.')


with st.spinner(text="Loading data..."):
    df = load_data(purl)
    passages = df['text'].values

# Corpus statistics interpolated into the notes below.
no_of_graphs = len(df)
no_of_articles = len(df['cite'].value_counts())


# NOTE: two wording fixes vs. the original — "articles" was missing after
# the article count, and "or it looks broken" read awkwardly.
notes = f'''Notes:
* I have found three types of searches work best:
    * Phrases or specific topics, such as "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", or "logistic regression or linear probability model".
    * Citations to well-known works, either using author year ("bourdieu 1984") or author idea ("Crenshaw intersectionality")
    * Questions, like "What is a topic model?" or "How did Weber define bureaucracy?"
* The search expands beyond exact matching, so "asia social movements" may return paragraphs on Asian-Americans politics and South Korean labor unions.
* The first search can take up to 10 seconds as the files load. After that, it's quicker to respond.
* The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
* The most relevant sentence within each paragraph, as determined by math, is displayed. Click on it to see the full paragraph.
* The results are not exhaustive, and seem to drift off even when you suspect there are more relevant articles :man-shrugging:.
* The dataset currently includes {no_of_graphs:,} paragraphs from {no_of_articles:,} articles published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*.
* Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
* Let [me](mailto:neal.caren@unc.edu) know what you think or if it looks broken.
'''

st.markdown(notes)


def sent_trans_load():
    """Load the bi-encoder used to embed queries for semantic retrieval."""
    encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    # Truncate long passages to 256 tokens (the model supports up to 512).
    encoder.max_seq_length = 256
    return encoder

def sent_cross_load():
    """Load the cross-encoder used to re-rank retrieved passages."""
    return CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


# Fetch the precomputed corpus embeddings up front; the spinner covers the
# (possibly slow) first download.
with st.spinner(text="Loading embeddings..."):
    corpus_embeddings = load_embeddings()


def search(query, top_k=50):
    """Run a retrieve & re-rank search and render the results with Streamlit.

    Parameters
    ----------
    query : str
        The user's search phrase.
    top_k : int, default 50
        Number of candidates to pull from the bi-encoder stage before
        cross-encoder re-ranking.

    Relies on module-level globals: ``bi_encoder``, ``cross_encoder``,
    ``corpus_embeddings``, ``passages``, ``df``, and ``device``.
    """
    ##### Semantic Search #####
    # Encode the query with the bi-encoder and find candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True).to(device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, score in zip(hits, cross_scores):
        hit['cross-score'] = score

    print("\n-------------------------\n")
    print("Search Results")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    # Group the top 30 passages by article citation, preserving rank order.
    hd = OrderedDict()
    for hit in hits[:30]:
        row_id = hit['corpus_id']
        cite = df.loc[row_id]['cite']
        graph = df.loc[row_id]['text']

        # Find the single sentence most relevant to the query and bold it.
        sentences = sent_tokenize(graph)
        sentence_scores = cross_encoder.predict([[query, s] for s in sentences])
        # idxmax replaces the original sort_values().index[-1]: O(n), not O(n log n).
        thesis = pd.Series(sentence_scores, index=sentences).idxmax()
        graph = graph.replace(thesis, f'**{thesis}**')

        hd.setdefault(cite, []).append(graph)

    for cite, graphs in hd.items():
        cite = cite.replace(", ", '. "').replace(', Social ', '", Social ')
        st.write(cite)

        # Show at most five paragraphs per article.
        for graph in graphs[:5]:
            # Recover the bolded thesis sentence for the expander label.
            # Raw string fixes the invalid-escape DeprecationWarning that the
            # original '\*\*(.*?)\*\*' literal triggered.
            thesis = re.findall(r'\*\*(.*?)\*\*', graph)[0]

            with st.expander(thesis):
                st.write(f'> {graph}')
            st.write('')


search_query = st.text_input('Enter your search phrase:')
if search_query != '':
    with st.spinner(text="Searching and sorting results."):
        # Show the EMERAC mascot while the models load and the search runs,
        # then clear it so only the results remain.
        placeholder = st.empty()
        with placeholder.container():
            st.image('https://www.dropbox.com/s/yndn6lkesjga9a6/emerac.png?raw=1')
        bi_encoder = sent_trans_load()
        cross_encoder = sent_cross_load()
        search(search_query)
        placeholder.empty()
emerac.png ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence_transformers
2
+ torch
3
+ pandas
4
+ nltk
5
+ gdown