MarlonKegel committed on
Commit
88f9ba1
·
1 Parent(s): 7c7a313

using pre-tokenized chunks

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. Dockerfile +0 -3
  3. rag_ui.py +5 -15
  4. requirements.txt +1 -2
.DS_Store ADDED
Binary file (6.15 kB). View file
 
Dockerfile CHANGED
@@ -6,9 +6,6 @@ COPY requirements.txt .
6
  RUN pip install --upgrade pip
7
  RUN pip install -r requirements.txt
8
 
9
- # Download the spaCy English model
10
- RUN python -m spacy download en_core_web_sm
11
-
12
  COPY . .
13
 
14
  # Let Streamlit accept connections from everywhere, on port 7860 (HF Spaces default)
 
6
  RUN pip install --upgrade pip
7
  RUN pip install -r requirements.txt
8
 
 
 
 
9
  COPY . .
10
 
11
  # Let Streamlit accept connections from everywhere, on port 7860 (HF Spaces default)
rag_ui.py CHANGED
@@ -21,8 +21,6 @@ from huggingface_hub import hf_hub_download
21
  from rank_bm25 import BM25Okapi
22
  import io
23
  from docx import Document
24
- import spacy
25
- from functools import lru_cache
26
  import hashlib
27
 
28
  # Caching for search results function
@@ -31,16 +29,9 @@ def cached_search(query, chunk_idx_pool_tuple, n_final):
31
  return hybrid_search(query, chunk_idx_pool=list(chunk_idx_pool_tuple) if chunk_idx_pool_tuple else None, n_final=n_final)
32
 
33
  ############### TOKENIZER AND NORM FUNCTION ##############
34
- nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser']) # fast, enough for tokenization
35
 
36
- @lru_cache(maxsize=2048)
37
- def _spacy_tokenize(text):
38
- # spaCy removes punctuation, does lemmatization, drops stopwords
39
- doc = nlp(text.lower())
40
- return [tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop]
41
-
42
- def tokenize(text):
43
- return _spacy_tokenize(text)
44
 
45
  def l2_normalize(vecs, axis=1, epsilon=1e-10):
46
  norms = np.linalg.norm(vecs, ord=2, axis=axis, keepdims=True)
@@ -86,9 +77,8 @@ def load_search_data():
86
  fout.write(fidx.read())
87
  faiss_index = faiss.read_index("/tmp/zotero_chunks.index")
88
 
89
- # Tokenize all texts for BM25
90
- texts = [c["text"] for c in chunks][:100]
91
- tokenized_texts = [tokenize(text) for text in texts]
92
  bm25 = BM25Okapi(tokenized_texts)
93
 
94
  return chunks, faiss_index, bm25
@@ -124,7 +114,7 @@ source_key_map = dict(zip(source_labels, sources_sorted)) # Map label to (author
124
 
125
  ########### BM25-BASED SPARSE SEARCH ###########
126
  def sparse_search(query, chunk_idx_pool=None, k=TOPK_SPARSE):
127
- query_tokens = tokenize(query)
128
  if chunk_idx_pool is None:
129
  scores = bm25.get_scores(query_tokens)
130
  idxs = np.argsort(scores)[::-1][:k]
 
21
  from rank_bm25 import BM25Okapi
22
  import io
23
  from docx import Document
 
 
24
  import hashlib
25
 
26
  # Caching for search results function
 
29
  return hybrid_search(query, chunk_idx_pool=list(chunk_idx_pool_tuple) if chunk_idx_pool_tuple else None, n_final=n_final)
30
 
31
  ############### TOKENIZER AND NORM FUNCTION ##############
 
32
 
33
def query_tokenize(text):
    """Tokenize a search query: lowercase it and return runs of word characters.

    Replaces the previous spaCy-based tokenizer; chunks are now pre-tokenized,
    so only queries need tokenizing at runtime.
    NOTE(review): relies on `re` being imported at module top — confirm, since
    no `import re` is visible in this diff. Also verify the pre-tokenized chunk
    tokens were produced with this same scheme, or BM25 scores will be skewed.
    """
    word_re = re.compile(r"\w+")
    return word_re.findall(text.lower())
 
 
 
 
 
 
35
 
36
  def l2_normalize(vecs, axis=1, epsilon=1e-10):
37
  norms = np.linalg.norm(vecs, ord=2, axis=axis, keepdims=True)
 
77
  fout.write(fidx.read())
78
  faiss_index = faiss.read_index("/tmp/zotero_chunks.index")
79
 
80
+ # get tokens for BM25
81
+ tokenized_texts = [c["tokens"] for c in chunks]
 
82
  bm25 = BM25Okapi(tokenized_texts)
83
 
84
  return chunks, faiss_index, bm25
 
114
 
115
  ########### BM25-BASED SPARSE SEARCH ###########
116
  def sparse_search(query, chunk_idx_pool=None, k=TOPK_SPARSE):
117
+ query_tokens = query_tokenize(query)
118
  if chunk_idx_pool is None:
119
  scores = bm25.get_scores(query_tokens)
120
  idxs = np.argsort(scores)[::-1][:k]
requirements.txt CHANGED
@@ -7,5 +7,4 @@ tqdm
7
  huggingface_hub
8
  rank_bm25
9
  python-docx
10
- fpdf2
11
- spacy
 
7
  huggingface_hub
8
  rank_bm25
9
  python-docx
10
+ fpdf2