File size: 3,659 Bytes
f220545
 
8d29351
 
 
b82c89f
8d29351
f220545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28d69ea
 
 
 
 
 
 
 
 
3a9193b
 
28d69ea
f220545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import os
import uuid
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi, InferenceClient, hf_hub_download
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Get the data logs path
data_logs_dir = os.path.join(
    Path(os.getcwd()).parent,
    'data-logs'
)


session_id = str(uuid.uuid4())  # Unique ID per launch
log_path = f"logs/session_{session_id}.jsonl"
os.makedirs("logs", exist_ok=True)

def log_interaction_hf(user_message, assistant_response):
    """Append one chat turn to this session's JSONL log file.

    Args:
        user_message: The user's message text.
        assistant_response: The assistant's reply text.
    """
    log_entry = {
        # Timezone-aware UTC: datetime.utcnow() is deprecated (3.12) and
        # produced a naive timestamp with no offset marker.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user": user_message,
        "assistant": assistant_response
    }
    # Append mode so the whole session accumulates in a single file; the file
    # is UTF-8, so keep non-ASCII text readable instead of \u-escaping it.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")


def upload_log_to_hf(token):
    """Upload this session's log folder to the chat-logs dataset repo.

    Args:
        token: Hugging Face access token with write access to the repo.
    """
    # BUG FIX: HfApi's first positional parameter is `endpoint`, not `token`,
    # so HfApi(token) passed the secret as the API base URL and left the
    # client unauthenticated. Pass it by keyword.
    api = HfApi(token=token)
    api.upload_folder(
        folder_path="logs",
        path_in_repo=f"session_{session_id}",
        repo_id="kaburia/chat-logs-policy-analysis",
        repo_type="dataset",
    )

# === Step 0: Download FAISS index files if not present ===

def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Fetch the persisted FAISS index files from a HF dataset repo if absent.

    Args:
        repo_id: Hugging Face *dataset* repository holding the index files.
        local_folder: Local directory the files are downloaded into.
    """
    os.makedirs(local_folder, exist_ok=True)

    # Both artifacts are required by FAISS.load_local(); the two copies of
    # this stanza were identical except for the filename, so loop instead.
    for filename in ("index.faiss", "index.pkl"):
        if os.path.exists(os.path.join(local_folder, filename)):
            continue  # already cached locally
        print(f"Downloading {filename} from Hugging Face Dataset...")
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",          # files live in a dataset repo, not a model repo
            local_dir=local_folder,
            local_dir_use_symlinks=False,  # real files, not symlinks, so FAISS can open them
        )

# Fetch the index at import time so the vectorstore below can be loaded.
download_faiss_index()


# === Step 1: Load Vectorstore ===

def load_vectorstore(index_path="faiss_index"):
    """Load the persisted FAISS index using MiniLM sentence embeddings."""
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # The pickle side-car was produced by us, hence the deserialization opt-in.
    return FAISS.load_local(
        index_path,
        embeddings=embedder,
        allow_dangerous_deserialization=True
    )

# Module-level singleton queried by retrieve_context().
vectorstore = load_vectorstore()

def cosine_to_prob(score):
    """Linearly map a cosine similarity in [-1, 1] onto [0, 1]."""
    return 0.5 * score + 0.5

def retrieve_context(question, p=5, threshold=0.5):
    """Build a prompt context string from the top-p relevant index chunks.

    Args:
        question: Query text to search the vectorstore with.
        p: Maximum number of chunks to keep.
        threshold: Minimum cosine_to_prob-mapped score a chunk must exceed.
    """
    # Over-fetch candidates so thresholding still leaves enough to pick from.
    scored = vectorstore.similarity_search_with_score(question, k=50)

    # NOTE(review): this treats the raw scores as cosine similarities
    # (higher = better). If the FAISS index actually returns L2 distances,
    # both the threshold and the descending sort are inverted — confirm how
    # the index was built.
    keep = [pair for pair in scored if cosine_to_prob(pair[1]) > threshold]
    keep.sort(key=lambda pair: pair[1], reverse=True)

    # Concatenate the winning chunks for the prompt.
    return "\n\n".join(doc.page_content for doc, _ in keep[:p])