hamza50 committed on
Commit
b2b5a46
·
0 Parent(s):

Duplicate from hamza50/cicero_semantic_search

Browse files
Files changed (5) hide show
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +79 -0
  4. entire_data.pkl +3 -0
  5. requirements.txt +20 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Cicero Semantic Search
3
+ emoji: 🐢
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.23.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: hamza50/cicero_semantic_search
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import tiktoken
4
+ import pandas as pd
5
+ import time
6
+ import spacy
7
+ from spacy.lang.en.stop_words import STOP_WORDS
8
+ from string import punctuation
9
+ from collections import Counter
10
+ from heapq import nlargest
11
+ import nltk
12
+ import numpy as np
13
+ from tqdm import tqdm
14
+ from sentence_transformers import SentenceTransformer, util
15
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
16
+ import gzip
17
+ import os
18
+ import torch
19
+ import re
20
+
21
+ from openai.embeddings_utils import get_embedding, cosine_similarity
22
+ import os
23
+
24
+
25
+
26
# Article table shipped with the app as a Git LFS pickle. The rest of this
# file reads its `content`, `summary_html`, `title`, `url`, `keywords` and
# `embedding` columns.
# NOTE: unpickling can execute arbitrary code -- only ever load a trusted file.
df = pd.read_pickle(filepath_or_buffer='entire_data.pkl')

# Sentence encoder used to embed incoming queries.
# NOTE(review): presumably the same model that produced df.embedding -- confirm.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
28
+
29
# Compiled once at import time. DOTALL lets `.` cross newlines so that a tag
# whose attributes wrap onto the next line is still removed.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tags(text):
    """Return *text* with every ``<...>`` span removed.

    Args:
        text: The string to clean. Missing values as pandas represents them
            (``None`` or a float NaN) are accepted and treated as empty text.

    Returns:
        The input with all tag-like spans stripped, or ``""`` for a missing
        value. HTML entities such as ``&amp;`` are left untouched.

    Raises:
        TypeError: If *text* is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return _HTML_TAG_RE.sub('', text)
32
+
33
# Strip markup from the two free-text columns once at start-up so that
# request handling works on plain text.
df['content'] = df['content'].apply(remove_html_tags)
df['summary_html'] = df['summary_html'].apply(remove_html_tags)
35
+
36
def search(query, n=10):
    """Return the articles whose stored embeddings best match *query*.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against every vector in ``df.embedding``. The ``n``
    highest-scoring rows are kept and then collapsed to one entry per
    (title, url, keywords, summary_html) combination, keeping the best score,
    so fewer than ``n`` results may be returned.

    Unlike the previous implementation this does not write a ``similarity``
    column into the shared ``df`` (which concurrent requests would overwrite),
    and it computes all scores in one NumPy pass instead of relying on the
    ``openai.embeddings_utils`` helper.

    Args:
        query: Free-text search string.
        n: Number of top-scoring rows to consider before de-duplication.

    Returns:
        A list of dicts with keys ``Title``, ``url``, ``score`` (the
        similarity as a string), ``summary`` (first 200 characters of the
        summary) and ``keywords``, ordered from best to worst match.
    """
    if df.empty:
        return []

    # Flatten both sides so the result is a plain scalar per row regardless of
    # whether vectors are stored as (dim,), (1, dim) or Python lists.
    # assumes every stored embedding has the same length as the query vector
    query_vec = np.asarray(model.encode(query), dtype=float).ravel()
    doc_matrix = np.vstack([np.ravel(vec) for vec in df.embedding]).astype(float)

    dots = doc_matrix @ query_vec
    denom = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # Zero-length vectors score 0 instead of producing NaN / divide warnings.
    scores = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    # Stable sort on the negated scores: best first, ties keep table order.
    top_idx = np.argsort(-scores, kind="stable")[:n]
    key_columns = ['title', 'url', 'keywords', 'summary_html']
    top_rows = df.iloc[top_idx][key_columns].assign(similarity=scores[top_idx])

    # Several rows can belong to the same article; keep each article's best score.
    best = (
        top_rows.groupby(key_columns)
        .similarity.max()
        .reset_index()
        .sort_values("similarity", ascending=False)
    )

    return [
        {
            "Title": row.title,
            "url": row.url,
            "score": str(float(row.similarity)),
            "summary": row.summary_html[:200],
            "keywords": row.keywords,
        }
        for row in best.itertuples(index=False)
    ]
59
+
60
def greet(query):
    """Gradio callback: run the semantic search for *query*.

    Returns whatever ``search(query)`` returns, which the interface renders
    as JSON.
    """
    return search(query)
64
+
65
# Sample queries offered in the UI. Gradio expects one inner list per example,
# holding one value per input component (a single textbox here).
examples = [
    [prompt]
    for prompt in (
        "Climate Change Challenges in Europe",
        "Philosophy in the world of Minimalism",
        "Hate Speech vs Freedom of Speech",
        "Articles by Noam Chomsky on US Politics",
        "The importance of values and reflection",
    )
]
72
+
73
# Single-textbox interface: the query goes to `greet`, the result list is
# rendered as JSON. `gr.Textbox` replaces the deprecated `gr.inputs.Textbox`
# (legacy namespace that newer Gradio releases no longer ship); it takes the
# same `lines` / `label` arguments.
demo = gr.Interface(
    fn=greet,
    title="cicero-semantic-search",
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="json",
    examples=examples,
)

# Start the web server; the Space runs this file as its entry point.
demo.launch()
78
+
79
+
entire_data.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d719ff7c8e72ee0f56541a05b3eac5241adb7f19c7237ac3d6546af12f6dde22
3
+ size 51891614
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ scipy
3
+ tqdm
4
+ gensim
5
+ plotly
6
+ scikit-learn
7
+ numpy
8
+ wordcloud
9
+ matplotlib
10
+ openai
11
+ langchain
12
+ faiss-cpu
13
+ tiktoken
14
+ sentence_transformers
15
+ scipy
16
+ tqdm
17
+ matplotlib
18
+ spacy
19
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
20
+ rank-bm25