"""Embedding, similarity, and time-string helpers."""

import re
from typing import Any, List, Tuple

import numpy as np
import openai

from config import EMBED_MODEL

# Hour (1-2 digits), optional colon, optional minute digits, then AM/PM.
# Compiled once at import time so repeated clean_time() calls reuse it.
_TIME_PATTERN = re.compile(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', re.IGNORECASE)


def get_embedding(text: str) -> List[float]:
    """Generate an embedding for a given text.

    Newlines in ``text`` are replaced with spaces and surrounding whitespace
    is stripped before the text is sent to ``openai.embeddings.create`` with
    the model named by ``EMBED_MODEL``.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    text_strip = text.replace("\n", " ").strip()
    response = openai.embeddings.create(input=[text_strip], model=EMBED_MODEL)
    return response.data[0].embedding


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a plain Python float, or ``0.0`` when
        either vector has zero norm (which avoids a division by zero).
    """
    vec_a = np.asarray(a)
    vec_b = np.asarray(b)
    # Compute each norm once; they are needed for both the guard and the ratio.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # float() so the declared return type holds (np.dot yields a NumPy scalar).
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))


def clean_time(time_str: str) -> str:
    """Normalize the first 12-hour clock time found in a string.

    The first ``<hour>[:]<minute> AM|PM`` occurrence (case-insensitive) is
    rewritten as ``"H:MM AM"`` / ``"H:MM PM"``. A missing minute becomes
    ``"00"``.

    Examples:
        ``"5pm"`` -> ``"5:00 PM"``, ``"1030AM"`` -> ``"10:30 AM"``,
        ``"130 PM"`` -> ``"1:30 PM"``, ``"5:3 PM"`` -> ``"5:03 PM"``.

    Args:
        time_str: Raw time text; may be empty or ``None``.

    Returns:
        The normalized time, ``""`` for falsy input, or the stripped input
        unchanged when no time pattern is found.
    """
    if not time_str:
        return ""

    time_match = _TIME_PATTERN.search(time_str)
    if time_match is None:
        return time_str.strip()

    hour, minute, ampm = time_match.groups()
    if len(minute) == 1:
        if len(hour) == 2 and ":" not in time_match.group(0):
            # Compact three-digit form such as "130 PM": the greedy hour group
            # took "13" and left "0", so re-split it as 1:30.
            hour, minute = hour[0], hour[1] + minute
        else:
            # A lone minute digit ("5:3 PM") is zero-padded to two digits.
            minute = minute.zfill(2)

    return f"{hour}:{minute or '00'} {ampm.upper()}"


def find_top_k_matches(
    user_embedding: List[float], dataset, k: int = 3
) -> List[Tuple[float, Any, Any]]:
    """Find the top k matching entries from a dataset.

    Args:
        user_embedding: Query vector compared against every entry.
        dataset: Iterable of ``(entry_id, text, emb)`` triples, where ``emb``
            is the entry's vector.
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` ``(score, entry_id, text)`` tuples ordered by descending
        cosine similarity. Entries with equal scores keep their dataset order.
    """
    scored = [
        (cosine_similarity(user_embedding, emb), entry_id, text)
        for entry_id, text, emb in dataset
    ]
    # Sort on the score alone: comparing whole tuples would fall through to
    # entry_id/text on ties and raise TypeError for non-orderable ids.
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]