import time
from functools import lru_cache

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from app.config import settings

_MODEL_NAME = 'all-MiniLM-L6-v2'


@lru_cache(maxsize=1)
def _load_model():
    """Return the sentence-embedding model, constructing it only once per process."""
    return SentenceTransformer(_MODEL_NAME)


@lru_cache(maxsize=4)
def _load_icd_dataset(path):
    """Return ``(df, embeddings)`` for the pickle at *path*, cached per path.

    ``embeddings`` is the row-wise stack of the ``encoded`` column, or ``None``
    when the table has no rows (``np.vstack`` rejects an empty sequence).
    """
    # NOTE(review): unpickling can execute arbitrary code; the file at
    # settings.ICD_DATA_PATH must come from a trusted source.
    df = pd.read_pickle(path)
    if len(df) == 0:
        return df, None
    # Stack the per-row vectors into one 2-D array for cosine_similarity.
    return df, np.vstack(df['encoded'].values)


def get_ICD_Code(query: str, threshold: float = 0.5):
    """Return the code of the dataset row whose embedding best matches *query*.

    Args:
        query: Free-text description to look up.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The value in column 0 of the best-matching row (presumably the ICD
        code -- confirm against the pickle's schema), or ``None`` when the
        query is empty / not a string, the dataset is empty, or the best
        similarity is below *threshold*.
    """
    # Nothing meaningful to embed; let the caller apply its own fallback.
    if not isinstance(query, str) or not query.strip():
        return None

    df, dataset_embeddings = _load_icd_dataset(settings.ICD_DATA_PATH)
    if dataset_embeddings is None:
        return None
    model = _load_model()

    start_time = time.perf_counter()

    query_embedding = model.encode([query], normalize_embeddings=True)

    # One query row against every dataset row; [0] selects that single row.
    similarities = cosine_similarity(query_embedding, dataset_embeddings)[0]
    most_similar_index = int(np.argmax(similarities))
    best_score = similarities[most_similar_index]

    print(f"Most similar sentence: \"{df.iloc[most_similar_index]['Description']}\" with similarity score: {best_score}")
    print(f"Execution time: {time.perf_counter() - start_time:.4f} seconds")

    # Reject weak matches so callers can substitute a placeholder.
    if best_score < threshold:
        return None

    return df.iloc[most_similar_index, 0]  # Assuming column 0 is the ICD code


def map_records_with_icd(records: list):
    """Set ``record["ID"]`` on every record from its ``"TestName"`` and return *records*.

    Records are modified in place. When no code is found for a record (or the
    found value is falsy), its ``"ID"`` is set to ``"UNKNOWN"``.
    """
    for record in records:
        record["ID"] = get_ICD_Code(record.get("TestName", "")) or "UNKNOWN"
    return records