# policy-analysis/utils/helpers.py
# (HF Space header residue: author "kaburia", commit "langchain community embeddings", b82c89f)
import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Path to the sibling 'data-logs' directory (one level above the CWD).
data_logs_dir = os.path.join(Path(os.getcwd()).parent, 'data-logs')

# One JSONL log file per app launch, keyed by a random session id.
session_id = str(uuid.uuid4())
os.makedirs("logs", exist_ok=True)
log_path = f"logs/session_{session_id}.jsonl"
def log_interaction_hf(user_message, assistant_response):
    """Append one user/assistant exchange to this session's JSONL log file.

    Each line is a JSON object with a UTC ISO-8601 timestamp and the two
    message strings.
    """
    log_entry = {
        # Timezone-aware UTC stamp; datetime.utcnow() is deprecated (3.12+)
        # and returns a naive datetime.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response,
    }
    with open(log_path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII chat text readable in the log.
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
def upload_log_to_hf(token):
    """Upload this session's log folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write access to the repo.
    """
    # Bug fix: HfApi's first positional parameter is `endpoint`, not `token`.
    # HfApi(token) silently used the token string as the API base URL and
    # left the client unauthenticated; the token must be passed by keyword.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )
# === Step 0: Download FAISS index files if not present ===
def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Fetch the FAISS index files from a HF dataset repo if not cached.

    Downloads ``index.faiss`` and ``index.pkl`` into *local_folder*,
    skipping any file that already exists locally.

    Args:
        repo_id: Hugging Face dataset repository holding the index files.
        local_folder: Local directory the files are placed in.
    """
    os.makedirs(local_folder, exist_ok=True)
    # The two branches were copy-pasted; iterate over the filenames instead.
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",  # the index lives in a dataset repo, not a model repo
            local_dir=local_folder,
            local_dir_use_symlinks=False,  # deprecated no-op on recent hub versions; kept for older ones
        )
# Ensure the index files exist locally before load_vectorstore() runs below
# (downloads from the HF dataset on first launch; no-op afterwards).
download_faiss_index()
# === Step 1: Load Vectorstore ===
def load_vectorstore(index_path="faiss_index", model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the local FAISS index with its matching embedding model.

    Args:
        index_path: Folder containing ``index.faiss`` / ``index.pkl``.
        model_name: Sentence-transformers model; must match the model the
            index was built with, or retrieval scores are meaningless.

    Returns:
        The loaded FAISS vectorstore.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    # load_local unpickles index.pkl — only load indexes from a trusted source.
    db = FAISS.load_local(
        index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    return db
# Module-level singleton vectorstore, queried by retrieve_context().
vectorstore = load_vectorstore()
def cosine_to_prob(score):
    """Map a cosine similarity in [-1, 1] onto a [0, 1] pseudo-probability."""
    return 0.5 * (score + 1)
def retrieve_context(question, p=5, threshold=0.5, fetch_k=50):
    """Retrieve up to *p* relevant chunks for *question* as one prompt string.

    Over-fetches ``fetch_k`` candidates from the module-level ``vectorstore``,
    keeps those whose score maps above *threshold* via ``cosine_to_prob``,
    and joins the top-*p* page contents with blank lines.

    Args:
        question: Query text to embed and search with.
        p: Maximum number of chunks to include in the context.
        threshold: Minimum ``cosine_to_prob`` score to keep a chunk.
        fetch_k: Candidate pool size fetched before filtering (was a
            hard-coded 50).

    Returns:
        The joined context string ("" when nothing passes the threshold).

    NOTE(review): FAISS ``similarity_search_with_score`` returns a *distance*
    (lower = better) unless the index was built for cosine/inner-product
    similarity — confirm the index metric, since both the threshold filter
    and the descending sort here assume higher = more similar.
    """
    results = vectorstore.similarity_search_with_score(question, k=fetch_k)
    filtered = [(doc, score) for doc, score in results if cosine_to_prob(score) > threshold]
    top_p_docs = sorted(filtered, key=lambda x: x[1], reverse=True)[:p]
    return "\n\n".join(doc.page_content for doc, _ in top_p_docs)