"""Policy-analysis chat helpers.

Logs each chat session to a JSONL file (uploadable to a Hugging Face dataset
repo) and retrieves prompt context from a pre-built FAISS index of document
embeddings downloaded from the Hub.
"""

import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Path to the sibling `data-logs` directory (not used below; kept for
# other modules / notebooks that may import it).
data_logs_dir = os.path.join(Path(os.getcwd()).parent, "data-logs")

# One JSONL log file per app launch, keyed by a fresh session id.
session_id = str(uuid.uuid4())  # Unique ID per launch
log_path = f"logs/session_{session_id}.jsonl"
os.makedirs("logs", exist_ok=True)


def log_interaction_hf(user_message, assistant_response):
    """Append one user/assistant exchange to this session's JSONL log.

    Args:
        user_message: Raw text the user sent.
        assistant_response: Text the assistant replied with.
    """
    log_entry = {
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # and returned a naive datetime.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response,
    }
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry) + "\n")


def upload_log_to_hf(token):
    """Upload the local ``logs/`` folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write access to the repo.
    """
    # BUG FIX: HfApi's first positional parameter is `endpoint`, not `token`.
    # `HfApi(token)` silently misconfigured the client; the token must be
    # passed by keyword for authenticated uploads to work.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )


# === Step 0: Download FAISS index files if not present ===
def download_faiss_index(repo_id="kaburia/epic-a-embeddings",
                         local_folder="faiss_index"):
    """Download ``index.faiss`` and ``index.pkl`` from a Hub dataset repo.

    Files already present in ``local_folder`` are not re-downloaded.

    Args:
        repo_id: Hugging Face dataset repo holding the index files.
        local_folder: Local directory the index files are stored in.
    """
    os.makedirs(local_folder, exist_ok=True)
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",  # files live in a dataset repo, not a model repo
            local_dir=local_folder,
            # `local_dir_use_symlinks` was dropped: it is deprecated and
            # ignored by current huggingface_hub (real files are always
            # written to local_dir).
        )


# Download FAISS index if needed (runs at import so the vectorstore below
# can load).
download_faiss_index()


# === Step 1: Load Vectorstore ===
def load_vectorstore(index_path="faiss_index"):
    """Load the on-disk FAISS index with a MiniLM sentence embedder.

    Args:
        index_path: Directory containing ``index.faiss`` / ``index.pkl``.

    Returns:
        A loaded ``FAISS`` vectorstore.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # allow_dangerous_deserialization: index.pkl is unpickled on load, which
    # can execute arbitrary code — acceptable only because the index comes
    # from our own (trusted) dataset repo.
    db = FAISS.load_local(
        index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    return db


vectorstore = load_vectorstore()


def cosine_to_prob(score):
    """Map a cosine similarity in [-1, 1] linearly onto [0, 1]."""
    # Convert cosine similarity from [-1, 1] to [0, 1]
    return (score + 1) / 2


def retrieve_context(question, p=5, threshold=0.5):
    """Return up to ``p`` relevant document chunks joined into one string.

    Args:
        question: Query text to embed and search with.
        p: Maximum number of chunks to include.
        threshold: Minimum mapped score (via ``cosine_to_prob``) to keep.

    Returns:
        The kept chunks' ``page_content`` joined by blank lines (may be
        empty when nothing clears the threshold).
    """
    # NOTE(review): FAISS.similarity_search_with_score returns a raw
    # *distance* (lower = closer) for L2-metric indexes, but this filter
    # assumes cosine similarity (higher = better). Confirm the index was
    # built with an inner-product/cosine metric, otherwise the threshold
    # and the descending sort select the *worst* matches.
    results = vectorstore.similarity_search_with_score(question, k=50)  # get more than needed

    # Keep only results whose mapped "probability" clears the threshold.
    filtered = [
        (doc, score) for doc, score in results
        if cosine_to_prob(score) > threshold
    ]

    # Sort by score descending and take top-p.
    top_p_docs = sorted(filtered, key=lambda x: x[1], reverse=True)[:p]

    # Join content for the prompt.
    context = "\n\n".join(doc.page_content for doc, _ in top_p_docs)
    return context