Spaces:
Sleeping
Sleeping
import time
from functools import lru_cache

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from app.config import settings
_ICD_MODEL_NAME = 'all-MiniLM-L6-v2'


@lru_cache(maxsize=1)
def _load_icd_model():
    """Build the sentence-embedding model once and reuse it on later calls."""
    return SentenceTransformer(_ICD_MODEL_NAME)


@lru_cache(maxsize=1)
def _load_icd_dataset(path):
    """Read the pickled ICD table at *path* and stack its stored embeddings.

    Returns a ``(dataframe, embeddings)`` pair; ``embeddings`` is ``None``
    when the table has no rows. The result is cached per path, so changes to
    the file on disk are only picked up after
    ``_load_icd_dataset.cache_clear()``.
    """
    # NOTE(review): unpickling executes arbitrary code from the file; this is
    # only safe while the configured path points at a trusted artefact.
    df = pd.read_pickle(path)
    if len(df) == 0:
        return df, None
    # Each cell of 'encoded' holds one embedding; stack them into a 2-D array.
    return df, np.vstack(df['encoded'].values)


def get_ICD_Code(query: str, threshold: float = 0.5):
    """Return the code of the dataset row most similar to *query*.

    The query is embedded with the sentence-transformer model and compared,
    by cosine similarity, against the embeddings stored in the ``'encoded'``
    column of the table at ``settings.ICD_DATA_PATH``. The best match's
    description and the lookup time are printed.

    Args:
        query: Free-text description to look up.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The value in the first column of the best-matching row (assumed to
        be the ICD code), or ``None`` when the query is blank or not a
        string, the dataset is empty, or the best score is below
        ``threshold``.
    """
    # A blank query has no meaningful nearest neighbour; skip the model.
    if not isinstance(query, str) or not query.strip():
        return None

    df, dataset_embeddings = _load_icd_dataset(settings.ICD_DATA_PATH)
    if dataset_embeddings is None:
        return None
    model = _load_icd_model()

    # Timing covers only the encode + similarity search, not resource loading.
    start_time = time.perf_counter()
    query_embedding = model.encode([query], normalize_embeddings=True)
    similarities = cosine_similarity(query_embedding, dataset_embeddings)[0]
    most_similar_index = int(np.argmax(similarities))
    best_score = similarities[most_similar_index]

    description = df.iloc[most_similar_index]['Description']
    print(f"Most similar sentence: \"{description}\" with similarity score: {best_score}")
    print(f"Execution time: {time.perf_counter() - start_time:.4f} seconds")

    # Honour the caller's threshold so weak matches are reported as "no match".
    if best_score < threshold:
        return None
    return df.iloc[most_similar_index, 0]  # Assuming column 0 is the ICD code
def map_records_with_icd(records: list):
    """Fill each record's ``"ID"`` key with the code matched to its test name.

    Each record is updated in place. ``"ID"`` is set to the result of
    ``get_ICD_Code(record["TestName"])``, or to ``"UNKNOWN"`` when the
    lookup returns a falsy value or the record has no usable test name
    (key missing, value not a string, or blank).

    Args:
        records: List of dict-like records, each optionally carrying a
            ``"TestName"`` entry.

    Returns:
        The same ``records`` list, with every record's ``"ID"`` set.
    """
    # Lookups are expensive, so resolve each distinct test name only once.
    resolved = {}
    for record in records:
        test_name = record.get("TestName")
        # Without a real name there is nothing to match; do not let the
        # matcher pick an arbitrary nearest neighbour for an empty query.
        if not isinstance(test_name, str) or not test_name.strip():
            record["ID"] = "UNKNOWN"
            continue
        if test_name not in resolved:
            resolved[test_name] = get_ICD_Code(test_name) or "UNKNOWN"
        record["ID"] = resolved[test_name]
    return records