# Hugging Face Spaces page header captured during export ("Spaces: Running");
# not part of the application code.
import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download

# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# Resolve the sibling `data-logs` directory next to the current working
# directory (kept for later use; not read in this section).
data_logs_dir = str(Path.cwd().parent / 'data-logs')

# One fresh JSONL log file per app launch, kept under ./logs.
session_id = str(uuid.uuid4())  # Unique ID per launch
log_path = f"logs/session_{session_id}.jsonl"
os.makedirs("logs", exist_ok=True)
def log_interaction_hf(user_message, assistant_response, path=None):
    """Append one user/assistant exchange to the session JSONL log.

    Args:
        user_message: Raw text of the user's message.
        assistant_response: Raw text of the assistant's reply.
        path: Optional override for the log file; defaults to the
            module-level per-session ``log_path``.
    """
    target = log_path if path is None else path
    log_entry = {
        # Timezone-aware UTC stamp; datetime.utcnow() is deprecated and
        # produces a naive datetime.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response,
    }
    with open(target, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry) + "\n")
def upload_log_to_hf(token):
    """Upload this session's log folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write permission to
            ``kaburia/chat-logs-policy-analysis``.
    """
    # Bug fix: HfApi's first positional parameter is `endpoint`, not
    # `token` — `HfApi(token)` would send the token as the API base URL.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )
# === Step 0: Download FAISS index files if not present ===
def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Fetch the FAISS index artifacts from a HF dataset repo if absent.

    Args:
        repo_id: Dataset repository holding ``index.faiss`` / ``index.pkl``.
        local_folder: Local directory the files are downloaded into.
    """
    os.makedirs(local_folder, exist_ok=True)
    # The two files were fetched by two copy-pasted branches; one loop
    # keeps them in sync.
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",  # files live in a dataset repo, not a model repo
            local_dir=local_folder,
            # NOTE(review): deprecated/ignored in huggingface_hub >= 0.23 —
            # confirm the pinned version before removing.
            local_dir_use_symlinks=False,
        )


# Download FAISS index if needed
download_faiss_index()
# === Step 1: Load Vectorstore ===
def load_vectorstore(index_path="faiss_index"):
    """Load the persisted FAISS vectorstore using MiniLM embeddings.

    Args:
        index_path: Folder containing ``index.faiss`` and ``index.pkl``.

    Returns:
        The deserialized FAISS vectorstore.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # load_local unpickles index.pkl, so the explicit opt-in is required;
    # only acceptable because the index comes from a repo we control.
    return FAISS.load_local(
        index_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )


vectorstore = load_vectorstore()
def cosine_to_prob(score):
    """Map a cosine similarity in [-1, 1] onto the unit interval [0, 1]."""
    return 0.5 * (score + 1)
def retrieve_context(question, p=5, threshold=0.5, pool_size=50):
    """Retrieve the top-``p`` relevant chunks for ``question`` as one string.

    Args:
        question: Query text to embed and search with.
        p: Maximum number of chunks kept after filtering.
        threshold: Minimum ``cosine_to_prob(score)`` for a chunk to count
            as relevant.
        pool_size: Number of candidates pulled from the index before
            filtering (previously hard-coded to 50).

    Returns:
        The selected chunks' ``page_content`` joined by blank lines;
        empty string when nothing passes the threshold.
    """
    # NOTE(review): FAISS's similarity_search_with_score returns a raw
    # *distance* by default (lower = closer), not a cosine similarity in
    # [-1, 1]. Confirm the index was built with normalized embeddings /
    # cosine metric; otherwise the threshold mapping and the descending
    # sort below would favor the *least* similar chunks.
    results = vectorstore.similarity_search_with_score(question, k=pool_size)
    filtered = [(doc, score) for doc, score in results if cosine_to_prob(score) > threshold]
    top_p_docs = sorted(filtered, key=lambda pair: pair[1], reverse=True)[:p]
    return "\n\n".join(doc.page_content for doc, _ in top_p_docs)