Spaces:
Runtime error
Runtime error
MarlonKegel committed on
Commit ·
88f9ba1
1
Parent(s): 7c7a313
using pre-tokenized chunks
Browse files
- .DS_Store +0 -0
- Dockerfile +0 -3
- rag_ui.py +5 -15
- requirements.txt +1 -2
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
Dockerfile
CHANGED
|
@@ -6,9 +6,6 @@ COPY requirements.txt .
|
|
| 6 |
RUN pip install --upgrade pip
|
| 7 |
RUN pip install -r requirements.txt
|
| 8 |
|
| 9 |
-
# Download the spaCy English model
|
| 10 |
-
RUN python -m spacy download en_core_web_sm
|
| 11 |
-
|
| 12 |
COPY . .
|
| 13 |
|
| 14 |
# Let Streamlit accept connections from everywhere, on port 7860 (HF Spaces default)
|
|
|
|
| 6 |
RUN pip install --upgrade pip
|
| 7 |
RUN pip install -r requirements.txt
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
COPY . .
|
| 10 |
|
| 11 |
# Let Streamlit accept connections from everywhere, on port 7860 (HF Spaces default)
|
rag_ui.py
CHANGED
|
@@ -21,8 +21,6 @@ from huggingface_hub import hf_hub_download
|
|
| 21 |
from rank_bm25 import BM25Okapi
|
| 22 |
import io
|
| 23 |
from docx import Document
|
| 24 |
-
import spacy
|
| 25 |
-
from functools import lru_cache
|
| 26 |
import hashlib
|
| 27 |
|
| 28 |
# Caching for search results function
|
|
@@ -31,16 +29,9 @@ def cached_search(query, chunk_idx_pool_tuple, n_final):
|
|
| 31 |
return hybrid_search(query, chunk_idx_pool=list(chunk_idx_pool_tuple) if chunk_idx_pool_tuple else None, n_final=n_final)
|
| 32 |
|
| 33 |
############### TOKENIZER AND NORM FUNCTION ##############
|
| 34 |
-
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser']) # fast, enough for tokenization
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# spaCy removes punctuation, does lemmatization, drops stopwords
|
| 39 |
-
doc = nlp(text.lower())
|
| 40 |
-
return [tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop]
|
| 41 |
-
|
| 42 |
-
def tokenize(text):
|
| 43 |
-
return _spacy_tokenize(text)
|
| 44 |
|
| 45 |
def l2_normalize(vecs, axis=1, epsilon=1e-10):
|
| 46 |
norms = np.linalg.norm(vecs, ord=2, axis=axis, keepdims=True)
|
|
@@ -86,9 +77,8 @@ def load_search_data():
|
|
| 86 |
fout.write(fidx.read())
|
| 87 |
faiss_index = faiss.read_index("/tmp/zotero_chunks.index")
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
tokenized_texts = [tokenize(text) for text in texts]
|
| 92 |
bm25 = BM25Okapi(tokenized_texts)
|
| 93 |
|
| 94 |
return chunks, faiss_index, bm25
|
|
@@ -124,7 +114,7 @@ source_key_map = dict(zip(source_labels, sources_sorted)) # Map label to (author
|
|
| 124 |
|
| 125 |
########### BM25-BASED SPARSE SEARCH ###########
|
| 126 |
def sparse_search(query, chunk_idx_pool=None, k=TOPK_SPARSE):
|
| 127 |
-
query_tokens =
|
| 128 |
if chunk_idx_pool is None:
|
| 129 |
scores = bm25.get_scores(query_tokens)
|
| 130 |
idxs = np.argsort(scores)[::-1][:k]
|
|
|
|
| 21 |
from rank_bm25 import BM25Okapi
|
| 22 |
import io
|
| 23 |
from docx import Document
|
|
|
|
|
|
|
| 24 |
import hashlib
|
| 25 |
|
| 26 |
# Caching for search results function
|
|
|
|
| 29 |
return hybrid_search(query, chunk_idx_pool=list(chunk_idx_pool_tuple) if chunk_idx_pool_tuple else None, n_final=n_final)
|
| 30 |
|
| 31 |
############### TOKENIZER AND NORM FUNCTION ##############
|
|
|
|
| 32 |
|
| 33 |
+
def query_tokenize(text):
|
| 34 |
+
return re.findall(r"\w+", text.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def l2_normalize(vecs, axis=1, epsilon=1e-10):
|
| 37 |
norms = np.linalg.norm(vecs, ord=2, axis=axis, keepdims=True)
|
|
|
|
| 77 |
fout.write(fidx.read())
|
| 78 |
faiss_index = faiss.read_index("/tmp/zotero_chunks.index")
|
| 79 |
|
| 80 |
+
# get tokens for BM25
|
| 81 |
+
tokenized_texts = [c["tokens"] for c in chunks]
|
|
|
|
| 82 |
bm25 = BM25Okapi(tokenized_texts)
|
| 83 |
|
| 84 |
return chunks, faiss_index, bm25
|
|
|
|
| 114 |
|
| 115 |
########### BM25-BASED SPARSE SEARCH ###########
|
| 116 |
def sparse_search(query, chunk_idx_pool=None, k=TOPK_SPARSE):
|
| 117 |
+
query_tokens = query_tokenize(query)
|
| 118 |
if chunk_idx_pool is None:
|
| 119 |
scores = bm25.get_scores(query_tokens)
|
| 120 |
idxs = np.argsort(scores)[::-1][:k]
|
requirements.txt
CHANGED
|
@@ -7,5 +7,4 @@ tqdm
|
|
| 7 |
huggingface_hub
|
| 8 |
rank_bm25
|
| 9 |
python-docx
|
| 10 |
-
fpdf2
|
| 11 |
-
spacy
|
|
|
|
| 7 |
huggingface_hub
|
| 8 |
rank_bm25
|
| 9 |
python-docx
|
| 10 |
+
fpdf2
|
|
|