# policy-analysis/utils/helpers.py
# (HF Space header residue: author "kaburia", commit "langchain community embeddings", b82c89f)
import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Path to the sibling 'data-logs' directory (one level above the CWD).
data_logs_dir = os.path.join(Path(os.getcwd()).parent, 'data-logs')

# One JSONL log file per app launch, keyed by a random session id.
session_id = str(uuid.uuid4())
os.makedirs("logs", exist_ok=True)
log_path = f"logs/session_{session_id}.jsonl"
def log_interaction_hf(user_message, assistant_response):
    """Append one user/assistant exchange to this session's JSONL log file.

    Each line is a JSON object with a UTC ISO-8601 timestamp and the two
    message strings.
    """
    log_entry = {
        # Timezone-aware UTC stamp; datetime.utcnow() is deprecated (3.12+)
        # and returns a naive datetime.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response,
    }
    with open(log_path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII chat text readable in the log.
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
def upload_log_to_hf(token):
    """Upload this session's log folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write access to the repo.
    """
    # Bug fix: HfApi's first positional parameter is `endpoint`, not `token`.
    # HfApi(token) silently used the token string as the API base URL and
    # left the client unauthenticated; the token must be passed by keyword.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )
# === Step 0: Download FAISS index files if not present ===
def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Fetch the FAISS index files from a HF dataset repo if not cached.

    Downloads ``index.faiss`` and ``index.pkl`` into *local_folder*,
    skipping any file that already exists locally.

    Args:
        repo_id: Hugging Face dataset repository holding the index files.
        local_folder: Local directory the files are placed in.
    """
    os.makedirs(local_folder, exist_ok=True)
    # The two branches were copy-pasted; iterate over the filenames instead.
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",  # the index lives in a dataset repo, not a model repo
            local_dir=local_folder,
            local_dir_use_symlinks=False,  # deprecated no-op on recent hub versions; kept for older ones
        )
# Ensure the index files exist locally before load_vectorstore() runs below
# (downloads from the HF dataset on first launch; no-op afterwards).
download_faiss_index()
# === Step 1: Load Vectorstore ===
def load_vectorstore(index_path="faiss_index", model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the local FAISS index with its matching embedding model.

    Args:
        index_path: Folder containing ``index.faiss`` / ``index.pkl``.
        model_name: Sentence-transformers model; must match the model the
            index was built with, or retrieval scores are meaningless.

    Returns:
        The loaded FAISS vectorstore.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    # load_local unpickles index.pkl — only load indexes from a trusted source.
    db = FAISS.load_local(
        index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    return db
# Module-level singleton vectorstore, queried by retrieve_context().
vectorstore = load_vectorstore()
def cosine_to_prob(score):
    """Map a cosine similarity in [-1, 1] onto a [0, 1] pseudo-probability."""
    return 0.5 * (score + 1)
def retrieve_context(question, p=5, threshold=0.5, fetch_k=50):
    """Retrieve up to *p* relevant chunks for *question* as one prompt string.

    Over-fetches ``fetch_k`` candidates from the module-level ``vectorstore``,
    keeps those whose score maps above *threshold* via ``cosine_to_prob``,
    and joins the top-*p* page contents with blank lines.

    Args:
        question: Query text to embed and search with.
        p: Maximum number of chunks to include in the context.
        threshold: Minimum ``cosine_to_prob`` score to keep a chunk.
        fetch_k: Candidate pool size fetched before filtering (was a
            hard-coded 50).

    Returns:
        The joined context string ("" when nothing passes the threshold).

    NOTE(review): FAISS ``similarity_search_with_score`` returns a *distance*
    (lower = better) unless the index was built for cosine/inner-product
    similarity — confirm the index metric, since both the threshold filter
    and the descending sort here assume higher = more similar.
    """
    results = vectorstore.similarity_search_with_score(question, k=fetch_k)
    filtered = [(doc, score) for doc, score in results if cosine_to_prob(score) > threshold]
    top_p_docs = sorted(filtered, key=lambda x: x[1], reverse=True)[:p]
    return "\n\n".join(doc.page_content for doc, _ in top_p_docs)