ocrAPP / faq_store.py
hkai20000's picture
Update faq_store.py
ee0bd33 verified
import os
import json
import pickle
import numpy as np
from openai import OpenAI
# Directory containing this module; the data files below are resolved relative to it.
_current_dir = os.path.dirname(__file__)
# FAQ source file: read line by line, each non-blank line parsed as one JSON object.
_FAQS_FILE = os.path.join(_current_dir, "medical_faqs.jsonl")
# Pickle cache holding an (entries, vectors) tuple written by initialize_faq_store().
_EMBED_CACHE_FILE = os.path.join(_current_dir, "medical_faqs_embeddings.pkl")
# Model name passed to the embeddings endpoint when embedding FAQ questions.
_EMBED_MODEL = "text-embedding-3-small"
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# FAQ id -> parsed FAQ object (loaded from the JSONL file and/or the pickle cache).
FAQ_ENTRIES: dict[str, dict] = {}
# (FAQ id, question-embedding vector) pairs, filled by initialize_faq_store().
FAQ_VECS: list[tuple[str, np.ndarray]] = []
def initialize_faq_store():
    """Populate ``FAQ_ENTRIES`` and ``FAQ_VECS`` from disk, embedding what is missing.

    Steps:
      1. Load the pickle cache (if present) holding ``(entries, vectors)``.
      2. Read ``_FAQS_FILE`` (JSONL); each non-blank line is an object that
         must have an ``"id"`` key. File entries override cached entries with
         the same id.
      3. Reuse a cached vector only when the cached entry's ``"question"``
         still matches the current one; otherwise request a fresh embedding.
      4. Write the refreshed cache back to ``_EMBED_CACHE_FILE``.

    The module-level stores are mutated in place (never rebound), so other
    modules holding a reference to them observe the refreshed contents.

    Raises:
        FileNotFoundError: if ``_FAQS_FILE`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry lacks ``"id"``, or lacks ``"question"`` when an
            embedding has to be computed for it.
        Errors raised by the OpenAI client propagate unchanged.
    """
    FAQ_ENTRIES.clear()
    FAQ_VECS.clear()

    cached_entries: dict[str, dict] = {}
    cached_vecs: dict[str, np.ndarray] = {}
    if os.path.exists(_EMBED_CACHE_FILE):
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable because the cache file is produced by this function and
        # lives next to the module; never point it at untrusted data.
        try:
            with open(_EMBED_CACHE_FILE, "rb") as f:
                loaded_entries, loaded_vecs = pickle.load(f)
            if isinstance(loaded_entries, dict) and isinstance(loaded_vecs, dict):
                cached_entries, cached_vecs = loaded_entries, loaded_vecs
            else:
                print("FAQ embedding cache has an unexpected layout; rebuilding it")
        except (
            pickle.UnpicklingError,
            EOFError,
            AttributeError,
            ImportError,
            IndexError,
            TypeError,
            ValueError,
        ) as exc:
            # A truncated/corrupt cache must not block start-up: fall back to
            # re-embedding everything and overwrite the bad file below.
            print(f"FAQ embedding cache unreadable ({exc!r}); rebuilding it")

    # Entries that exist only in the cache are kept, as before.
    # NOTE(review): this means FAQs deleted from the JSONL file linger until
    # the cache file is removed -- confirm this is intended.
    FAQ_ENTRIES.update(cached_entries)

    with open(_FAQS_FILE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            FAQ_ENTRIES[obj["id"]] = obj

    new_vecs: dict[str, np.ndarray] = {}
    for fid, entry in FAQ_ENTRIES.items():
        cached_entry = cached_entries.get(fid)
        # The cache is keyed by id only, so a vector is trustworthy only if
        # the question it was computed from has not changed since.
        cache_is_fresh = (
            fid in cached_vecs
            and isinstance(cached_entry, dict)
            and cached_entry.get("question") == entry.get("question")
        )
        if cache_is_fresh:
            vec = cached_vecs[fid]
        else:
            resp = client.embeddings.create(model=_EMBED_MODEL, input=entry["question"])
            vec = np.array(resp.data[0].embedding, dtype=np.float32)
            new_vecs[fid] = vec
        FAQ_VECS.append((fid, vec))

    # Fresh vectors take precedence over any stale cached vector with the same id.
    all_vecs = {**cached_vecs, **new_vecs}

    # Write to a temp file and swap it in, so an interrupted write cannot
    # leave a half-written cache behind.
    tmp_cache_file = _EMBED_CACHE_FILE + ".tmp"
    with open(tmp_cache_file, "wb") as f:
        pickle.dump((FAQ_ENTRIES, all_vecs), f)
    os.replace(tmp_cache_file, _EMBED_CACHE_FILE)

    print(f"FAQ store initialised with {len(FAQ_ENTRIES)} entries")