NealCaren commited on
Commit
35d8ab8
·
0 Parent(s):

Duplicate from NealCaren/paragraphs

Browse files
Files changed (5) hide show
  1. .gitattributes +36 -0
  2. README.md +13 -0
  3. app.py +189 -0
  4. emerac.png +0 -0
  5. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ passages_0.jsonl filter=lfs diff=lfs merge=lfs -text
33
+ passages_4.jsonl filter=lfs diff=lfs merge=lfs -text
34
+ passages_1.jsonl filter=lfs diff=lfs merge=lfs -text
35
+ passages_2.jsonl filter=lfs diff=lfs merge=lfs -text
36
+ passages_3.jsonl filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Paragraphs
3
+ emoji: 😻
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: NealCaren/paragraphs
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: semantic search ("EMERAC") over sociology journal paragraphs.

Retrieves candidate paragraphs with a bi-encoder, re-ranks them with a
cross-encoder, and renders results grouped by article.
"""

import io
import pickle
import re
from collections import OrderedDict

import gdown
import nltk
import numpy as np
import pandas as pd  # hoisted from mid-module so all imports live together
import requests
import streamlit as st
import torch
from nltk.tokenize import sent_tokenize
from PIL import Image
from sentence_transformers import CrossEncoder, SentenceTransformer, util

# CSS tweak: shrink the expander-header font used for the result sentences.
st.markdown(
    """
    <style>
    .streamlit-expanderHeader {
        font-size: medium;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Sentence-boundary models required by nltk.sent_tokenize below.
nltk.download('punkt')

# Encode on GPU when one is available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Google Drive URL of the passages file, supplied via Streamlit secrets.
purl = st.secrets["graphs_url"]
print(purl)
@st.cache
def load_embeddings():
    """Download the precomputed passage embeddings and return them as an array.

    Cached by Streamlit, so the Google Drive download happens only once
    per server process.
    """
    drive_url = "https://drive.google.com/uc?export=download&id=1z9eoBI07p_YtrdK1ZWZeCRT5T5mu5nhV"
    local_path = "embeddings.npy"
    gdown.download(drive_url, local_path, quiet=False)
    return np.load(local_path)

@st.cache
def load_data(url):
    """Download the passages JSONL from *url* and return it as a DataFrame.

    Cached by Streamlit; the index is reset so rows line up with the
    embedding matrix row order.
    """
    local_path = "passages.jsonl"
    gdown.download(url, local_path, quiet=False)

    frame = pd.read_json(local_path, lines=True)
    frame.reset_index(inplace=True, drop=True)
    return frame


st.title('Sociology EMERAC')

st.write('This project is a work-in-progress that searches the text of recently-published articles from a few sociology journals and retrieves the most relevant paragraphs.')


with st.spinner(text="Loading data..."):
    df = load_data(purl)
    passages = df['text'].values

# Corpus statistics interpolated into the notes below.
no_of_graphs = len(df)
no_of_articles = len(df['cite'].value_counts())


# NOTE: two wording fixes vs. the original — "articles" was missing after
# the article count, and "or it looks broken" read awkwardly.
notes = f'''Notes:
* I have found three types of searches work best:
    * Phrases or specific topics, such as "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", or "logistic regression or linear probability model".
    * Citations to well-known works, either using author year ("bourdieu 1984") or author idea ("Crenshaw intersectionality")
    * Questions, like "What is a topic model?" or "How did Weber define bureaucracy?"
* The search expands beyond exact matching, so "asia social movements" may return paragraphs on Asian-Americans politics and South Korean labor unions.
* The first search can take up to 10 seconds as the files load. After that, it's quicker to respond.
* The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
* The most relevant sentence within each paragraph, as determined by math, is displayed. Click on it to see the full paragraph.
* The results are not exhaustive, and seem to drift off even when you suspect there are more relevant articles :man-shrugging:.
* The dataset currently includes {no_of_graphs:,} paragraphs from {no_of_articles:,} articles published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*.
* Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
* Let [me](mailto:neal.caren@unc.edu) know what you think or if it looks broken.
'''

st.markdown(notes)


def sent_trans_load():
    """Load the bi-encoder used to embed queries for semantic retrieval."""
    encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    # Truncate long passages to 256 tokens (the model supports up to 512).
    encoder.max_seq_length = 256
    return encoder

def sent_cross_load():
    """Load the cross-encoder used to re-rank retrieved passages."""
    return CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


# Fetch the precomputed corpus embeddings up front; the spinner covers the
# (possibly slow) first download.
with st.spinner(text="Loading embeddings..."):
    corpus_embeddings = load_embeddings()


def search(query, top_k=50):
    """Run a retrieve & re-rank search and render the results with Streamlit.

    Parameters
    ----------
    query : str
        The user's search phrase.
    top_k : int, default 50
        Number of candidates to pull from the bi-encoder stage before
        cross-encoder re-ranking.

    Relies on module-level globals: ``bi_encoder``, ``cross_encoder``,
    ``corpus_embeddings``, ``passages``, ``df``, and ``device``.
    """
    ##### Semantic Search #####
    # Encode the query with the bi-encoder and find candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True).to(device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, score in zip(hits, cross_scores):
        hit['cross-score'] = score

    print("\n-------------------------\n")
    print("Search Results")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    # Group the top 30 passages by article citation, preserving rank order.
    hd = OrderedDict()
    for hit in hits[:30]:
        row_id = hit['corpus_id']
        cite = df.loc[row_id]['cite']
        graph = df.loc[row_id]['text']

        # Find the single sentence most relevant to the query and bold it.
        sentences = sent_tokenize(graph)
        sentence_scores = cross_encoder.predict([[query, s] for s in sentences])
        # idxmax replaces the original sort_values().index[-1]: O(n), not O(n log n).
        thesis = pd.Series(sentence_scores, index=sentences).idxmax()
        graph = graph.replace(thesis, f'**{thesis}**')

        hd.setdefault(cite, []).append(graph)

    for cite, graphs in hd.items():
        cite = cite.replace(", ", '. "').replace(', Social ', '", Social ')
        st.write(cite)

        # Show at most five paragraphs per article.
        for graph in graphs[:5]:
            # Recover the bolded thesis sentence for the expander label.
            # Raw string fixes the invalid-escape DeprecationWarning that the
            # original '\*\*(.*?)\*\*' literal triggered.
            thesis = re.findall(r'\*\*(.*?)\*\*', graph)[0]

            with st.expander(thesis):
                st.write(f'> {graph}')
            st.write('')


search_query = st.text_input('Enter your search phrase:')
if search_query != '':
    with st.spinner(text="Searching and sorting results."):
        # Show the EMERAC mascot while the models load and the search runs,
        # then clear it so only the results remain.
        placeholder = st.empty()
        with placeholder.container():
            st.image('https://www.dropbox.com/s/yndn6lkesjga9a6/emerac.png?raw=1')
        bi_encoder = sent_trans_load()
        cross_encoder = sent_cross_load()
        search(search_query)
        placeholder.empty()
emerac.png ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence_transformers
2
+ torch
3
+ pandas
4
+ nltk
5
+ gdown