Soha85 committed on
Commit
511e86c
·
verified ·
1 Parent(s): 91cc1cc

add HF token

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +9 -4
src/streamlit_app.py CHANGED
@@ -11,6 +11,11 @@ from sentence_transformers import CrossEncoder
11
  import pickle
12
  import chromadb
13
  from chromadb.utils import embedding_functions
 
 
 
 
 
14
  BASE_DIR = "/tmp/rag_app"
15
  os.makedirs(BASE_DIR, exist_ok=True)
16
  # Global variables
@@ -71,15 +76,15 @@ with tab2:
71
  chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
72
 
73
  if embedding_choice == "SentencePiece":
74
- model = SentenceTransformer("all-MiniLM-L6-v2")
75
  embeddings = model.encode(chunks, batch_size=300)
76
  elif embedding_choice == "TF-IDF":
77
  vectorizer = TfidfVectorizer()
78
  embeddings = vectorizer.fit_transform(chunks).toarray()
79
  elif embedding_choice == "BERT":
80
  model_name = "bert-base-uncased"
81
- tokenizer = AutoTokenizer.from_pretrained(model_name)
82
- model = AutoModel.from_pretrained(model_name)
83
  embeddings = bert_encode(model,tokenizer,chunks)
84
 
85
  if index_choice == "FAISS":
@@ -190,7 +195,7 @@ with tab3:
190
  #display similarity score measure used by ReRank and illustrate what number of score means more similar and its range
191
  st.write("Reranking using Cross-ReRank (higher score means more relevance, and lower score means less relevance). " \
192
  "It is relative ranking (higher score = more relevant), not the absolute magnitude.")
193
- reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
194
  scores = [reranker.predict([(user_query, doc)])[0] for doc in retrieved_texts]
195
  st.subheader("Reranked scores:")
196
  for doc, score in zip(retrieved_texts, scores):
 
11
  import pickle
12
  import chromadb
13
  from chromadb.utils import embedding_functions
14
+
15
+ hf_token = os.getenv("HF_token") # read from Space secret
16
+ if hf_token:
17
+ os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
18
+
19
  BASE_DIR = "/tmp/rag_app"
20
  os.makedirs(BASE_DIR, exist_ok=True)
21
  # Global variables
 
76
  chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
77
 
78
  if embedding_choice == "SentencePiece":
79
+ model = SentenceTransformer("all-MiniLM-L6-v2",use_auth_token=hf_token)
80
  embeddings = model.encode(chunks, batch_size=300)
81
  elif embedding_choice == "TF-IDF":
82
  vectorizer = TfidfVectorizer()
83
  embeddings = vectorizer.fit_transform(chunks).toarray()
84
  elif embedding_choice == "BERT":
85
  model_name = "bert-base-uncased"
86
+ tokenizer = AutoTokenizer.from_pretrained(model_name,token=hf_token)
87
+ model = AutoModel.from_pretrained(model_name,token=hf_token)
88
  embeddings = bert_encode(model,tokenizer,chunks)
89
 
90
  if index_choice == "FAISS":
 
195
  #display similarity score measure used by ReRank and illustrate what number of score means more similar and its range
196
  st.write("Reranking using Cross-ReRank (higher score means more relevance, and lower score means less relevance). " \
197
  "It is relative ranking (higher score = more relevant), not the absolute magnitude.")
198
+ reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2",token=hf_token)
199
  scores = [reranker.predict([(user_query, doc)])[0] for doc in retrieved_texts]
200
  st.subheader("Reranked scores:")
201
  for doc, score in zip(retrieved_texts, scores):