Spaces:
Build error
Build error
| from flask import Flask, jsonify, request, render_template | |
| import pandas as pd | |
| from flask_cors import CORS | |
| import os | |
| import json | |
| import torch | |
| from sentence_transformers import SentenceTransformer | |
| import logging | |
| from transformers import GPT2LMHeadModel, GPT2Tokenizer | |
| from datetime import datetime | |
| from transformers import BertTokenizer, BertForSequenceClassification | |
| import random | |
| import re | |
# ---------------------------------------------------------------------------
# Application bootstrap: Flask app, logging, models and reference data.
# Everything below runs once, at import time.
# ---------------------------------------------------------------------------
app = Flask(__name__)
app.json.sort_keys = False
CORS(app)

# Process-wide logging at DEBUG verbosity.
logging.basicConfig(level=logging.DEBUG)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sentence-embedding model used by the similarity view.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.to(device)
print("-" * 90)
print(device)

# BERT sequence classifier and its tokenizer, both read from ./saved_model.
loaded_model = BertForSequenceClassification.from_pretrained('saved_model')
loaded_tokenizer = BertTokenizer.from_pretrained('saved_model')

# GPT-2 tokenizer and language model used for auto-completion.
tokenizer = GPT2Tokenizer.from_pretrained("checkpoint-15000")
model_gpt = GPT2LMHeadModel.from_pretrained("checkpoint-15000")
model_gpt.to(device)
print("=" * 80)

# Claim history table used by classify_claim(); malformed CSV lines are skipped.
grouped = pd.read_csv('grouped_22_23_24.csv', on_bad_lines="skip")
from openai import OpenAI

# SECURITY: the OpenAI API key must never live in source control. It is read
# from the OPENAI_API_KEY environment variable instead. Any key that was
# previously committed in this file has to be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Fail at start-up with a clear message instead of on the first request.
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

client = OpenAI(api_key=api_key)
def ask_gpt(question):
    """Send ``question`` to the chat-completions API and return the reply text.

    The request targets the ``gpt-4o-mini`` model and consists of a fixed
    system message followed by ``question`` as the single user message.

    Args:
        question: Text sent as the user message.

    Returns:
        The message content of the first choice in the API response.
    """
    conversation = [
        {"role": "system", "content": "you are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
class DataFrameManager:
    """Load a notes CSV once and hand out copies of its leading rows.

    On construction the CSV at ``file_path`` is read (when it exists), three
    placeholder columns are filled with constant values, and the source
    columns are renamed to the names the views use (``id``, ``summary``,
    ``suggested_summary``). When the file is missing, the manager keeps an
    empty DataFrame and prints a message.
    """

    # Placeholder values written into every row by load_dataframe().
    CONSTANT_DATE = "2024-01-01"
    CONSTANT_POLICY_NUMBER = "POL123456"
    CONSTANT_STATUS = "Active"

    # CSV column name -> column name exposed to the rest of the app.
    COLUMN_RENAMES = {
        'note_id': 'id',
        'cleaned_comments': 'summary',
        'summarized_text': 'suggested_summary',
    }

    def __init__(self, file_path):
        """Remember ``file_path`` and load the CSV immediately."""
        self.file_path = file_path
        self.df = pd.DataFrame()  # stays empty when the file is missing
        self.load_dataframe()

    def load_dataframe(self):
        """(Re)read the CSV into ``self.df``; leave ``self.df`` as is if the file is absent."""
        if not os.path.exists(self.file_path):
            print(f"File not found: {self.file_path}")
            return

        frame = pd.read_csv(self.file_path)
        frame['date'] = self.CONSTANT_DATE
        frame['policy_number'] = self.CONSTANT_POLICY_NUMBER
        frame['status'] = self.CONSTANT_STATUS
        self.df = frame.rename(columns=self.COLUMN_RENAMES)

    def get_dataframe(self, limit=200):
        """Return an independent copy of the first ``limit`` rows.

        Args:
            limit: Maximum number of leading rows to return. Defaults to 200,
                the cap this method has always applied. Pass ``None`` to get
                a copy of every row.

        Returns:
            A new DataFrame; mutating it does not affect the cached data.
        """
        if limit is None:
            return self.df.copy()
        # Slice first so only the requested rows are copied.
        return self.df.head(limit).copy()


df_manager = DataFrameManager('client_notes_Sneha.csv')
def generate_text(prompt_text, max_length=100, num_return_sequences=10, max_new_tokens=3):
    """Generate short continuations of ``prompt_text`` with the GPT-2 model.

    Args:
        prompt_text: Text to continue.
        max_length: Accepted for backward compatibility only. It has no
            effect; the amount of generated text is set by ``max_new_tokens``.
        num_return_sequences: Number of sequences requested from the model.
        max_new_tokens: Number of tokens generated after the prompt
            (defaults to 3, the value this function has always used).

    Returns:
        Up to five distinct decoded texts, in the order the model returned
        them. An empty list is returned when generation fails; the failure is
        logged with its traceback.
    """
    # Tokenize once and reuse both tensors.
    encoded = tokenizer(prompt_text, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # The tokenizer may not define a padding token; use end-of-sequence then.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    try:
        outputs = model_gpt.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=pad_token_id,
            max_new_tokens=max_new_tokens,
            num_beams=50,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            num_return_sequences=num_return_sequences,
        )
        generated_texts = [
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        ]
    except Exception:
        # Best-effort helper: report the failure and give callers an empty
        # result instead of an implicit None.
        logging.exception("Text generation failed")
        return []

    logging.debug("Generated texts: %s", generated_texts)
    # dict.fromkeys() removes duplicates while keeping first-seen order, so
    # the five texts returned are deterministic for a given model output.
    unique_texts = list(dict.fromkeys(generated_texts))
    return unique_texts[:5]
def get_csv():
    """Return the rows served by ``df_manager`` as a JSON array of records.

    NOTE(review): no route decorator is visible for this view in this copy of
    the file; confirm how it is registered with the app.
    """
    records = df_manager.get_dataframe().to_dict(orient='records')
    return jsonify(records)
def search_notes():
    """Look up notes by numeric ID taken from the ``name`` field of the JSON body.

    Responses:
        * 200 with a JSON array of matching records when the ID is found.
        * 200 with an empty JSON object when no ID was supplied or nothing
          matched.
        * 400 with an ``error`` message when the ID cannot be converted to an
          integer.

    Only the rows returned by ``df_manager.get_dataframe()`` are searched.

    NOTE(review): no route decorator is visible for this view in this copy of
    the file; confirm how it is registered with the app.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    raw_claim_id = payload.get('name', '')
    if not raw_claim_id:
        logging.debug("No claim ID provided, returning empty result.")
        return jsonify({})

    try:
        claim_id = int(raw_claim_id)
    except (TypeError, ValueError) as exc:
        # TypeError covers non-scalar JSON values such as lists or objects.
        logging.debug("Invalid claim ID %r: %s", raw_claim_id, exc)
        return jsonify({"error": "Invalid claim ID format"}), 400

    notes = df_manager.get_dataframe()
    matches = notes[notes['id'] == claim_id]
    if matches.empty:
        logging.debug("No matching records found, returning empty result.")
        return jsonify({})

    return jsonify(matches.to_dict(orient='records'))
def get_similarity():
    """Rank notes by similarity of their suggested summary to one target note.

    The JSON body must contain an ``id``. The suggested summary of the first
    row with that ID is compared, by cosine similarity of sentence embeddings,
    with the suggested summary of every row returned by
    ``df_manager.get_dataframe()``.

    Responses:
        * 200 with a JSON array of records (``id``, ``status``,
          ``policy_number``, ``date``, ``summary``, ``suggested_summary``,
          ``similarity``) sorted by descending similarity.
        * 400 when the body is missing or has no ``id``.
        * 404 when no row has the requested ID.

    NOTE(review): no route decorator is visible for this view in this copy of
    the file; confirm how it is registered with the app.
    """
    data = request.get_json(silent=True)
    logging.debug(f"Received payload: {data}")
    if not isinstance(data, dict) or 'id' not in data:
        return jsonify({'error': 'No valid data provided'}), 400

    note_id = data['id']
    logging.debug(f"Note ID: {note_id}")

    df = df_manager.get_dataframe()
    if 'id' not in df.columns:
        # Happens when the notes CSV could not be loaded (empty DataFrame).
        return jsonify({'error': 'No matching record found'}), 404

    is_target = df['id'] == note_id
    if not is_target.any():
        return jsonify({'error': 'No matching record found'}), 404

    # Missing summaries become empty strings so every row can be encoded.
    texts = df['suggested_summary'].fillna('').astype(str).tolist()
    target_position = int(is_target.to_numpy().argmax())  # first matching row
    logging.debug(f"Summarized Text: {texts[target_position]}")

    # Encode all summaries in one batched call instead of one call per row.
    # Batching may change scores in the last few decimal places.
    embeddings = model.encode(texts, convert_to_tensor=True, device=device)
    target_embedding = embeddings[target_position].unsqueeze(0)
    scores = torch.nn.functional.cosine_similarity(target_embedding, embeddings).tolist()

    # The original code reads 'date_created'; fall back to the 'date' column
    # when the loaded data does not carry it.
    # NOTE(review): confirm which column the notes CSV actually provides.
    date_column = 'date_created' if 'date_created' in df.columns else 'date'
    dates = df[date_column].astype(str).str.split(' ').str[0]

    similarity_df = pd.DataFrame({
        'id': df['id'].tolist(),
        'status': df['status'].tolist(),
        'policy_number': df['policy_number'].tolist(),
        'date': dates.tolist(),
        'summary': df['summary'].tolist(),
        'suggested_summary': texts,
        'similarity': scores,
    })
    similarity_df = similarity_df.sort_values(by='similarity', ascending=False)
    logging.debug("Top similarities:\n%s", similarity_df.head())

    return jsonify(similarity_df.to_dict(orient='records')), 200
def auto_complete():
    """Return generated completions for the ``prompt`` query-string parameter.

    Responses:
        * 200 with ``{"generated_text": [...]}`` on success.
        * 400 when ``prompt`` is missing or empty.
        * 500 when generation raises or produces no text.

    NOTE(review): no route decorator is visible for this view in this copy of
    the file; confirm how it is registered with the app.
    """
    prompt = request.args.get('prompt', '')
    logging.debug("Auto-complete prompt: %r", prompt)
    if not prompt:
        return jsonify({"error":"No prompt Provided"}),400

    try:
        generated_texts = generate_text(prompt)
    except Exception as e:
        logging.exception("Auto-complete failed")
        return jsonify({"error":str(e)}),500

    # generate_text() signals an internal failure with a falsy result; report
    # it as an error instead of returning 200 with a null/empty payload.
    if not generated_texts:
        return jsonify({"error": "Text generation failed"}), 500

    logging.debug("Auto-complete result: %s", generated_texts)
    return jsonify({'generated_text': generated_texts})
# Weights of the two classifiers in the combined confidence score.
_BERT_WEIGHT = 0.35
_LLM_WEIGHT = 0.65

# A claim dated this many days (or more) after the last recorded service date
# is reported as a new case without consulting the models.
_FOLLOW_UP_WINDOW_DAYS = 90

# Prompt sent to the LLM. Doubled braces are literal braces for str.format().
_CLAIM_PROMPT_TEMPLATE = """Respond to the human as helpfully and accurately as possible. You are an expert in analyzing medical claims. Your task is to compare the new claim with past claims and determine if the New Claim is a "Follow-up Claim" (related to an existing issue) or a "Different Claim" (a separate, unrelated issue).
To make this determination, carefully analyze both the below diagnosis and the claim line notes for patterns, similarities, or differences.
**Existing Claims:**
- Diagnosis: "{past_claims_data_diagnosis}"
- Claim Line Note: "{past_claims_data_claim_line_note}"
**New Claim:**
- Diagnosis: "{diagnosis}"
- Claim Line Note: "{claim_line_note}"
Use a json blob to output a confidence sore along with a reasoning.
Valid "category" values: Follow-up Case, New Case
Valid "confidence_score" values: 1-100
Provide only ONE action per $JSON_BLOB, as shown:
```
{{
"action": $CATEGORY_NAME,
"confidence_score": "$CONFIDENCE_SCORE",
"reasoning": $REASONING
}}
```
Begin! Reminder to ALWAYS respond with a valid json blob of a single action.
Respond directly if appropriate. Format is Action:```$JSON_BLOB```"""


def _parse_list_column(cell):
    """Turn a stringified Python list stored in the ``grouped`` CSV into a list."""
    # NOTE(security): eval() executes whatever the CSV cell contains. It is
    # kept because the cells may hold bare ``nan`` tokens, which
    # ast.literal_eval() cannot parse; only load this CSV from a trusted
    # source, or replace this with a real parser.
    return eval(cell, {'nan': 'nan'})


def _extract_json_blobs(response):
    """Return every parsable ``{...}`` JSON object found in the LLM reply.

    ``confidence_score`` values are rescaled from 0-100 to 0-1. Fragments that
    are not valid JSON, or whose score is not numeric, are skipped.
    """
    blobs = []
    for candidate in re.findall(r'\{.*?\}', response or '', re.DOTALL):
        try:
            blob = json.loads(candidate)
        except json.JSONDecodeError as exc:
            logging.debug("Skipping unparsable JSON fragment: %s", exc)
            continue
        if 'confidence_score' in blob:
            try:
                blob['confidence_score'] = float(blob['confidence_score']) / 100
            except (TypeError, ValueError):
                logging.debug("Skipping blob with non-numeric confidence_score")
                continue
        blobs.append(blob)
    return blobs


def classify_claim():
    """Classify a new claim as a follow-up to an existing case or a new case.

    The JSON body must provide ``case_id``, ``diagnosis``,
    ``claim_line_notes`` and ``service_date`` (``YYYY-MM-DD``).

    Flow:
        1. Unknown ``case_id``: reply with a "new case" message and a random
           six-digit case ID.
        2. Service date 90 or more days after the last recorded one: reply
           with a "new case" message that includes the last service date.
        3. Otherwise combine the BERT classifier probabilities with the LLM
           verdict (weights 0.35 / 0.65) and return the winning prediction,
           a random claim ID, the weighted confidence and the LLM reasoning
           (plus a random case ID for a new case).

    Error responses: 400 for a malformed request, 500 when the stored service
    dates cannot be read, 502 when the LLM call fails or its reply contains
    no usable JSON verdict.

    NOTE(review): no route decorator is visible for this view in this copy of
    the file; confirm how it is registered with the app.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    try:
        case_id = int(data['case_id'])
        diagnosis = data['diagnosis']
        claim_line_note = data['claim_line_notes']
        service_date = datetime.strptime(data['service_date'], "%Y-%m-%d")
    except KeyError as exc:
        return jsonify({"error": f"Missing field: {exc.args[0]}"}), 400
    except (TypeError, ValueError):
        return jsonify({
            "error": "Invalid case_id or service_date (expected YYYY-MM-DD)"
        }), 400

    # Normalise the stored IDs to integers so they compare with case_id.
    grouped['case_id'] = grouped['case_id'].astype(int)
    record = grouped[grouped['case_id'] == case_id]

    if record.empty:
        new_case_id = random.randint(100000, 999999)
        return jsonify({"message": f"New Case: Customer id {case_id} not found. \n"
                                   f"A new case has been created with Case ID: {new_case_id}."})

    # Last service date recorded on the first matching row.
    try:
        existing_service_date = datetime.strptime(
            _parse_list_column(record['service_date'].values[0])[-1], "%Y-%m-%d"
        )
    except (IndexError, NameError, SyntaxError, TypeError, ValueError):
        logging.exception("Unreadable service_date history for case %s", case_id)
        return jsonify({
            "error": "Stored service date history is unreadable for this case"
        }), 500

    is_recent = (service_date - existing_service_date).days < _FOLLOW_UP_WINDOW_DAYS
    logging.debug("Last service date %s, new service date %s, recent=%s",
                  existing_service_date, service_date, is_recent)
    if not is_recent:
        return jsonify({
            "message": (
                f"New case (Customer id {case_id} found, however service date is more than 90 days), "
                f"Last service date: {existing_service_date.strftime('%Y-%m-%d')}"
            )
        })

    # Claim history comes from the last matching row, which is what the
    # original per-row loops ended up using.
    history_row = record.iloc[-1]
    diagnosis_history = ', '.join(map(str, _parse_list_column(history_row['diagnosis'])))
    notes_history = ', '.join(
        str(note)
        for note in _parse_list_column(history_row['claim_line_note'])
        if note != 'nan'
    )
    dates_history = ', '.join(map(str, _parse_list_column(history_row['service_date'])))

    input_sequence = (
        f"Diagnosis History: {diagnosis_history}, "
        f"Claim Line Notes History: {notes_history}, "
        f"Service Dates History: {dates_history}"
    )
    new_claim = (
        f"New Diagnosis: {diagnosis!s}, "
        f"New Claim Line Note: {claim_line_note!s}, "
        f"New Service Date: {service_date!s}"
    )
    logging.debug("Past claim history: %s", input_sequence)
    logging.debug("New claim data: %s", new_claim)

    # BERT: class probabilities for the (history, new claim) sentence pair.
    inputs = loaded_tokenizer(input_sequence, new_claim, padding=True,
                              truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    bert_probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Index 0 is treated as "follow-up" and index 1 as "new case".
    bert_confidence_follow_up = bert_probabilities[0][0].item()
    bert_confidence_new_case = bert_probabilities[0][1].item()

    # LLM: ask for a verdict with a confidence score and reasoning.
    prompt = _CLAIM_PROMPT_TEMPLATE.format(
        diagnosis=diagnosis,
        claim_line_note=claim_line_note,
        past_claims_data_diagnosis=diagnosis_history,
        past_claims_data_claim_line_note=notes_history,
    )
    try:
        llm_response = ask_gpt(prompt)
    except Exception:
        logging.exception("LLM request failed")
        return jsonify({"error": "LLM request failed"}), 502
    logging.debug("LLM response: %s", llm_response)

    final_output_llm = next(
        (blob for blob in _extract_json_blobs(llm_response)
         if 'action' in blob and 'confidence_score' in blob),
        None,
    )
    if final_output_llm is None:
        return jsonify({"error": "LLM response could not be parsed"}), 502

    llm_action = final_output_llm['action']
    llm_confidence = final_output_llm['confidence_score']
    reasoning = final_output_llm.get('reasoning', '')

    # The LLM's confidence counts for the class it chose; the complement
    # counts for the other class.
    json_confidence_follow_up = llm_confidence if llm_action == 'Follow-up Case' else 1 - llm_confidence
    json_confidence_new_case = llm_confidence if llm_action == 'New Case' else 1 - llm_confidence

    # Weighted combination of the two models.
    combined_confidence_follow_up = (
        _BERT_WEIGHT * bert_confidence_follow_up + _LLM_WEIGHT * json_confidence_follow_up
    )
    combined_confidence_new_case = (
        _BERT_WEIGHT * bert_confidence_new_case + _LLM_WEIGHT * json_confidence_new_case
    )
    final_prediction = (
        "New Case" if combined_confidence_new_case > combined_confidence_follow_up
        else "Follow-Up Case"
    )
    weighted_confidence = round(
        max(combined_confidence_follow_up, combined_confidence_new_case), 2
    )

    new_case_id = random.randint(100000, 999999)
    new_claim_id = random.randint(100000, 999999)

    # NOTE(review): the prediction key is spelled differently in the two
    # responses ("model" vs "models"). Both spellings are kept because
    # clients may depend on them.
    if final_prediction == "Follow-Up Case":
        return jsonify({
            "Ensembled model Prediction": final_prediction,
            "New Claim ID": new_claim_id,
            "Weighted Confidence Score": weighted_confidence,
            "Reasoning": reasoning
        })

    return jsonify({
        "Ensembled models Prediction": final_prediction,
        "New Claim ID": new_claim_id,
        "Weighted Confidence Score": weighted_confidence,
        "New Case ID": new_case_id,
        "Reasoning": reasoning
    })
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger
    # (which lets a client run arbitrary code) must not be on by default.
    # Enable it explicitly with FLASK_DEBUG=1 for local development only.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)