| import openai | |
| import numpy as np | |
| import re | |
| from typing import List, Tuple | |
| from config import EMBED_MODEL | |
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces and surrounding
    whitespace is stripped before the text is sent, as a single-item
    batch, to ``openai.embeddings.create`` using the ``EMBED_MODEL`` model.

    Args:
        text: The raw text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    flattened = text.replace("\n", " ")
    payload = [flattened.strip()]
    result = openai.embeddings.create(input=payload, model=EMBED_MODEL)
    first_item = result.data[0]
    return first_item.embedding
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (any 1-D sequence of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a built-in ``float``. If either
        vector has zero magnitude (including an empty vector) the
        similarity is undefined and ``0.0`` is returned instead.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    # Compute each norm once; it is needed both for the zero-vector guard
    # and for the denominator.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Convert to a plain float so both return paths yield the same type.
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
def clean_time(time_str: str) -> str:
    """Normalize a 12-hour clock time to ``"H:MM AM"`` / ``"H:MM PM"``.

    The first time-like fragment found in ``time_str`` is used; any
    surrounding text is dropped. Accepted shapes include ``"5 pm"``,
    ``"5:30pm"``, ``"5:3 PM"`` and colon-less forms such as ``"530 PM"``
    or ``"1230pm"``.

    Args:
        time_str: Raw time text. May be empty or ``None``.

    Returns:
        The normalized time with two-digit minutes and an upper-case
        meridiem. An empty string is returned for falsy input. If no
        AM/PM time is found, the input is returned stripped of
        surrounding whitespace.
    """
    if not time_str:
        return ""

    # Minutes are either colon-separated (0-2 digits, so "5: PM" still
    # parses) or, without a colon, exactly two trailing digits. Requiring
    # two digits in the colon-less case makes "530 PM" split as 5 / 30
    # rather than 53 / 0.
    time_match = re.search(
        r'(\d{1,2})(?::(\d{0,2})|(\d{2}))?\s*(AM|PM)',
        time_str,
        re.IGNORECASE,
    )
    if time_match is None:
        return time_str.strip()

    hour = time_match.group(1)
    raw_minute = time_match.group(2) or time_match.group(3) or ""
    # Zero-pad so a missing minute becomes "00" and "3" becomes "03".
    minute = raw_minute.zfill(2)
    ampm = time_match.group(4).upper()
    return f"{hour}:{minute} {ampm}"
def find_top_k_matches(user_embedding, dataset, k=3):
    """Return the ``k`` dataset entries most similar to ``user_embedding``.

    Args:
        user_embedding: Query vector to compare against each entry.
        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
        k: Maximum number of results to return (default 3).

    Returns:
        A list of ``(score, entry_id, text)`` tuples ordered by descending
        cosine similarity and truncated to ``k`` items. Entries with equal
        scores keep their relative order from ``dataset``.
    """
    scored = [
        (cosine_similarity(user_embedding, emb), entry_id, text)
        for entry_id, text, emb in dataset
    ]
    # Rank on the score alone. Sorting the raw tuples would fall back to
    # comparing entry_id/text on ties, which raises TypeError when those
    # values are not mutually orderable (e.g. None vs. int, or dicts).
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]