# File: medextract/app/utils/icd_utils.py
# Source-page residue (author avatar text, commit message, commit hash):
# harsh-dev's picture / Add / ec563fd
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pandas as pd
from app.config import settings
# Process-wide caches: the caller looks up one code per record, so reloading
# the pickle and the transformer model on every call would dominate runtime.
_ICD_MODEL_NAME = 'all-MiniLM-L6-v2'
_ICD_MODEL_CACHE = {}
_ICD_DATA_CACHE = {}


def _get_icd_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    if _ICD_MODEL_NAME not in _ICD_MODEL_CACHE:
        _ICD_MODEL_CACHE[_ICD_MODEL_NAME] = SentenceTransformer(_ICD_MODEL_NAME)
    return _ICD_MODEL_CACHE[_ICD_MODEL_NAME]


def _get_icd_data(path):
    """Return ``(df, embeddings)`` for the pickle at *path*, cached per path."""
    key = str(path)
    if key not in _ICD_DATA_CACHE:
        # NOTE: unpickling can execute arbitrary code; *path* must point to a
        # trusted file.
        df = pd.read_pickle(path)
        # Stack the per-row vectors of the 'encoded' column into one 2-D
        # array. np.vstack raises on an empty sequence, so an empty table is
        # represented by None instead.
        embeddings = np.vstack(df['encoded'].values) if len(df) else None
        _ICD_DATA_CACHE[key] = (df, embeddings)
    return _ICD_DATA_CACHE[key]


def get_ICD_Code(query: str, threshold: float = 0.5):
    """Return the code of the dataset row most similar to *query*.

    The query is embedded with the 'all-MiniLM-L6-v2' sentence-transformer
    and compared by cosine similarity against the vectors stored in the
    'encoded' column of the DataFrame pickled at ``settings.ICD_DATA_PATH``.

    Args:
        query: Free-text description to look up.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The value in column 0 of the best-matching row (assumed to be the
        ICD code -- confirm against the dataset schema), or ``None`` when
        the query is empty/not a string, the dataset has no rows, or the
        best similarity is below *threshold*.
    """
    # An empty or missing query has nothing meaningful to match against.
    if not isinstance(query, str) or not query.strip():
        return None

    df, dataset_embeddings = _get_icd_data(settings.ICD_DATA_PATH)
    if dataset_embeddings is None:
        return None
    model = _get_icd_model()

    start_time = time.time()
    query_embedding = model.encode([query], normalize_embeddings=True)

    # cosine_similarity returns a (1, n_rows) matrix; take the single row.
    similarities = cosine_similarity(query_embedding, dataset_embeddings)[0]
    most_similar_index = int(np.argmax(similarities))
    best_score = similarities[most_similar_index]

    print(f"Most similar sentence: \"{df.iloc[most_similar_index]['Description']}\" with similarity score: {best_score}")
    print(f"Execution time: {time.time() - start_time:.4f} seconds")

    # Written as "not >=" so that a NaN score is also treated as no match.
    if not (best_score >= threshold):
        return None
    return df.iloc[most_similar_index, 0]  # Assuming column 0 is the ICD code
def map_records_with_icd(records: list):
    """Attach an ICD code to every record under the "ID" key.

    Each record's "TestName" value (empty string when the key is absent) is
    passed to ``get_ICD_Code``; a falsy lookup result is replaced by
    "UNKNOWN". The records are modified in place.

    Args:
        records: List of dict-like records to annotate.

    Returns:
        The same ``records`` list that was passed in.
    """
    for entry in records:
        test_name = entry.get("TestName", "")
        icd_code = get_ICD_Code(test_name)
        entry["ID"] = icd_code if icd_code else "UNKNOWN"
    return records