| import os |
| import json |
| import pickle |
| import numpy as np |
| from openai import OpenAI |
|
|
# Paths are resolved relative to this module's directory so lookups work
# regardless of the process's current working directory.
_current_dir = os.path.dirname(__file__)
_FAQS_FILE = os.path.join(_current_dir, "medical_faqs.jsonl")  # one JSON object per line
_EMBED_CACHE_FILE = os.path.join(_current_dir, "medical_faqs_embeddings.pkl")  # pickled (entries, vectors)
_EMBED_MODEL = "text-embedding-3-small"  # OpenAI embedding model used for FAQ questions


# NOTE(review): constructed at import time — the SDK may raise immediately if
# OPENAI_API_KEY is unset; confirm this module is only imported when a key exists.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# FAQ id -> parsed JSONL record; populated by initialize_faq_store().
FAQ_ENTRIES: dict[str, dict] = {}
# (FAQ id, question-embedding vector) pairs, parallel to FAQ_ENTRIES.
FAQ_VECS: list[tuple[str, np.ndarray]] = []
|
|
|
|
def initialize_faq_store():
    """Load FAQ entries from the JSONL file and embed their questions.

    Rebuilds the module-level ``FAQ_ENTRIES`` and ``FAQ_VECS`` stores from
    scratch. Embedding vectors are cached on disk as a pickled
    ``(entries, vectors)`` tuple; a cached vector is reused only when the
    entry's question text is unchanged, so edited questions are re-embedded
    and entries removed from the JSONL file are dropped from both the
    in-memory store and the on-disk cache.

    Raises:
        FileNotFoundError: if the FAQ JSONL file does not exist.
        json.JSONDecodeError: if a non-blank JSONL line is malformed.
    """
    global FAQ_ENTRIES, FAQ_VECS
    FAQ_ENTRIES.clear()
    FAQ_VECS.clear()

    # Load the embedding cache if present. Keep the cached entries as well so
    # question-text changes can be detected below. A corrupt or truncated
    # cache is treated as empty (everything re-embeds) instead of crashing.
    # SECURITY: pickle.load is only acceptable because this file is written
    # locally by this module — never load a pickle from an untrusted source.
    cached_entries: dict[str, dict] = {}
    cached_vecs: dict[str, np.ndarray] = {}
    if os.path.exists(_EMBED_CACHE_FILE):
        try:
            with open(_EMBED_CACHE_FILE, "rb") as f:
                cached_entries, cached_vecs = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, ValueError, TypeError):
            cached_entries, cached_vecs = {}, {}

    # The JSONL file is the sole source of truth for entries: building
    # FAQ_ENTRIES from it alone (not from the cache) ensures entries deleted
    # from the file do not linger in memory or get re-written to the cache.
    with open(_FAQS_FILE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank lines between records
                continue
            obj = json.loads(line)
            FAQ_ENTRIES[obj["id"]] = obj

    # Reuse a cached vector only when the question text matches what was
    # embedded last time; new or edited questions get a fresh embedding.
    fresh_count = 0
    all_vecs: dict[str, np.ndarray] = {}
    for fid, entry in FAQ_ENTRIES.items():
        prev = cached_entries.get(fid)
        if (
            fid in cached_vecs
            and prev is not None
            and prev.get("question") == entry["question"]
        ):
            vec = cached_vecs[fid]
        else:
            resp = client.embeddings.create(model=_EMBED_MODEL, input=entry["question"])
            vec = np.array(resp.data[0].embedding, dtype=np.float32)
            fresh_count += 1
        all_vecs[fid] = vec
        FAQ_VECS.append((fid, vec))

    # Rewrite the cache only when it would actually change (new/edited
    # embeddings, or ids that were removed from the FAQ file).
    if fresh_count or set(all_vecs) != set(cached_vecs):
        with open(_EMBED_CACHE_FILE, "wb") as f:
            pickle.dump((FAQ_ENTRIES, all_vecs), f)

    print(f"FAQ store initialised with {len(FAQ_ENTRIES)} entries")
|
|