add HF token
Browse files- src/streamlit_app.py +9 -4
src/streamlit_app.py
CHANGED
|
@@ -11,6 +11,11 @@ from sentence_transformers import CrossEncoder
|
|
| 11 |
import pickle
|
| 12 |
import chromadb
|
| 13 |
from chromadb.utils import embedding_functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
BASE_DIR = "/tmp/rag_app"
|
| 15 |
os.makedirs(BASE_DIR, exist_ok=True)
|
| 16 |
# Global variables
|
|
@@ -71,15 +76,15 @@ with tab2:
|
|
| 71 |
chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
|
| 72 |
|
| 73 |
if embedding_choice == "SentencePiece":
|
| 74 |
-
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 75 |
embeddings = model.encode(chunks, batch_size=300)
|
| 76 |
elif embedding_choice == "TF-IDF":
|
| 77 |
vectorizer = TfidfVectorizer()
|
| 78 |
embeddings = vectorizer.fit_transform(chunks).toarray()
|
| 79 |
elif embedding_choice == "BERT":
|
| 80 |
model_name = "bert-base-uncased"
|
| 81 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 82 |
-
model = AutoModel.from_pretrained(model_name)
|
| 83 |
embeddings = bert_encode(model,tokenizer,chunks)
|
| 84 |
|
| 85 |
if index_choice == "FAISS":
|
|
@@ -190,7 +195,7 @@ with tab3:
|
|
| 190 |
#display similarity score measure used by ReRank and illustrate what number of score means more similar and its range
|
| 191 |
st.write("Reranking using Cross-ReRank (higher score means more relevance, and lower score means less relevance). " \
|
| 192 |
"It is relative ranking (higher score = more relevant), not the absolute magnitude.")
|
| 193 |
-
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
| 194 |
scores = [reranker.predict([(user_query, doc)])[0] for doc in retrieved_texts]
|
| 195 |
st.subheader("Reranked scores:")
|
| 196 |
for doc, score in zip(retrieved_texts, scores):
|
|
|
|
| 11 |
import pickle
|
| 12 |
import chromadb
|
| 13 |
from chromadb.utils import embedding_functions
|
| 14 |
+
|
| 15 |
+
# Hugging Face access token, if one was provided to the app.
hf_token = os.getenv("HF_token")  # read from Space secret
if hf_token:
    # huggingface_hub looks the token up under HF_TOKEN; the name
    # HUGGINGFACE_HUB_TOKEN used originally is not one it reads, so exporting
    # only that name had no effect on hub authentication.
    # setdefault keeps any HF_TOKEN the environment already provides.
    os.environ.setdefault("HF_TOKEN", hf_token)
    # Kept for backward compatibility with anything that reads the old name.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
|
| 18 |
+
|
| 19 |
BASE_DIR = "/tmp/rag_app"
|
| 20 |
os.makedirs(BASE_DIR, exist_ok=True)
|
| 21 |
# Global variables
|
|
|
|
| 76 |
chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
|
| 77 |
|
| 78 |
if embedding_choice == "SentencePiece":
|
| 79 |
+
# Pass the token via `token`: `use_auth_token` is deprecated, and `token` is the keyword the other model loaders in this file already use.
model = SentenceTransformer("all-MiniLM-L6-v2", token=hf_token)
|
| 80 |
embeddings = model.encode(chunks, batch_size=300)
|
| 81 |
elif embedding_choice == "TF-IDF":
|
| 82 |
vectorizer = TfidfVectorizer()
|
| 83 |
embeddings = vectorizer.fit_transform(chunks).toarray()
|
| 84 |
elif embedding_choice == "BERT":
|
| 85 |
model_name = "bert-base-uncased"
|
| 86 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name,token=hf_token)
|
| 87 |
+
model = AutoModel.from_pretrained(model_name,token=hf_token)
|
| 88 |
embeddings = bert_encode(model,tokenizer,chunks)
|
| 89 |
|
| 90 |
if index_choice == "FAISS":
|
|
|
|
| 195 |
#display similarity score measure used by ReRank and illustrate what number of score means more similar and its range
|
| 196 |
st.write("Reranking using Cross-ReRank (higher score means more relevance, and lower score means less relevance). " \
|
| 197 |
"It is relative ranking (higher score = more relevant), not the absolute magnitude.")
|
| 198 |
+
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2",token=hf_token)
|
| 199 |
scores = [reranker.predict([(user_query, doc)])[0] for doc in retrieved_texts]
|
| 200 |
st.subheader("Reranked scores:")
|
| 201 |
for doc, score in zip(retrieved_texts, scores):
|