File size: 3,659 Bytes
f220545
 
8d29351
 
 
b82c89f
8d29351
f220545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28d69ea
 
 
 
 
 
 
 
 
3a9193b
 
28d69ea
f220545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Get the data logs path
data_logs_dir = os.path.join(
    Path(os.getcwd()).parent,
    'data-logs'
)


session_id = str(uuid.uuid4())  # Unique ID per launch
log_path = f"logs/session_{session_id}.jsonl"
os.makedirs("logs", exist_ok=True)

def log_interaction_hf(user_message, assistant_response):
    """Append one chat turn to this session's JSONL log file.

    Args:
        user_message: The user's message text.
        assistant_response: The assistant's reply text.
    """
    log_entry = {
        # Timezone-aware UTC: datetime.utcnow() is deprecated (3.12) and
        # produced a naive timestamp with no offset marker.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response
    }
    # Append mode so the whole session accumulates in a single file; the file
    # is UTF-8, so keep non-ASCII text readable instead of \u-escaping it.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")


def upload_log_to_hf(token):
    """Upload this session's log folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write access to the repo.
    """
    # BUG FIX: HfApi's first positional parameter is `endpoint`, not `token`,
    # so HfApi(token) passed the secret as the API base URL and left the
    # client unauthenticated. Pass it by keyword.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )

# === Step 0: Download FAISS index files if not present ===

def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Fetch the persisted FAISS index files from a HF dataset repo if absent.

    Args:
        repo_id: Hugging Face *dataset* repository holding the index files.
        local_folder: Local directory the files are downloaded into.
    """
    os.makedirs(local_folder, exist_ok=True)

    # Both artifacts are required by FAISS.load_local(); the two copies of
    # this stanza were identical except for the filename, so loop instead.
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue  # already cached locally
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",          # files live in a dataset repo, not a model repo
            local_dir=local_folder,
            local_dir_use_symlinks=False,  # real files, not symlinks, so FAISS can open them
        )

# Fetch the index at import time so the vectorstore below can be loaded.
download_faiss_index()


# === Step 1: Load Vectorstore ===

def load_vectorstore(index_path="faiss_index"):
    """Load the persisted FAISS index using MiniLM sentence embeddings."""
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # The pickle side-car was produced by us, hence the deserialization opt-in.
    return FAISS.load_local(
        index_path,
        embeddings=embedder,
        allow_dangerous_deserialization=True
    )

# Module-level singleton queried by retrieve_context().
vectorstore = load_vectorstore()

def cosine_to_prob(score):
    """Linearly map a cosine similarity in [-1, 1] onto [0, 1]."""
    return 0.5 * score + 0.5

def retrieve_context(question, p=5, threshold=0.5):
    """Build a prompt context string from the top-p relevant index chunks.

    Args:
        question: Query text to search the vectorstore with.
        p: Maximum number of chunks to keep.
        threshold: Minimum cosine_to_prob-mapped score a chunk must exceed.
    """
    # Over-fetch candidates so thresholding still leaves enough to pick from.
    scored = vectorstore.similarity_search_with_score(question, k=50)

    # NOTE(review): this treats the raw scores as cosine similarities
    # (higher = better). If the FAISS index actually returns L2 distances,
    # both the threshold and the descending sort are inverted — confirm how
    # the index was built.
    keep = [pair for pair in scored if cosine_to_prob(pair[1]) > threshold]
    keep.sort(key=lambda pair: pair[1], reverse=True)

    # Concatenate the winning chunks for the prompt.
    return "\n\n".join(doc.page_content for doc, _ in keep[:p])