# Source: Hugging Face Space (status: Running); file size: 7,224 bytes.
import streamlit as st
import os
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from PIL import Image
import requests
from datetime import datetime
import pandas as pd
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# --- 1. RAG CONFIGURATION & INGESTION ---
@st.cache_resource
def get_vector_db(file_path):
    """Build a Chroma vector store from the alert-list CSV.

    Every non-empty CSV row becomes one ``Document`` whose text lists the
    provider name, license number, action taken and effective date; the
    license number and provider name are also stored as metadata. The
    documents are embedded with the ``all-MiniLM-L6-v2`` HuggingFace model.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the built store instead of re-embedding the file on every script run.

    Args:
        file_path: Path to the alert-list CSV file.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # dtype=str keeps identifiers exactly as written in the file. With type
    # inference a numeric license column that also contains blank rows is
    # parsed as float, turning "012345" into "12345.0" and breaking lookups.
    try:
        # utf-8-sig handles files with a BOM (Byte Order Mark), often added by Excel.
        df = pd.read_csv(file_path, encoding="utf-8-sig", dtype=str)
    except UnicodeDecodeError:
        # latin1 is a catch-all for many other Windows-based CSV exports.
        df = pd.read_csv(file_path, encoding="latin1", dtype=str)

    # Clean headers: "LICENSE\nNUMBER" -> "LICENSE NUMBER". str() guards
    # against non-string header labels.
    df.columns = [str(col).replace("\n", " ").strip().upper() for col in df.columns]

    def _clean(value):
        """Return a trimmed string for a cell, or 'Unknown' when it is blank."""
        if pd.isna(value):
            return "Unknown"
        text = str(value).strip()
        return text or "Unknown"

    documents = []
    for _, row in df.iterrows():
        # Use the cleaned UPPERCASE headers to extract data.
        name = row.get("NAME", "Unknown")
        lic = row.get("LICENSE NUMBER", "Unknown")
        action = row.get("TYPE OF ACTION", "Unknown")
        date = row.get("DATE OF ACTION", "Unknown")

        # Skip rows with neither a name nor a license (e.g. a month banner
        # row such as "JANUARY 2026" that only fills some other column).
        if pd.isna(name) and pd.isna(lic):
            continue

        # Normalise after the skip test so a single missing cell is shown as
        # "Unknown" rather than the literal text "nan".
        name, lic, action, date = (_clean(v) for v in (name, lic, action, date))

        content = (
            f"Provider Name: {name}\n"
            f"License Number: {lic}\n"
            f"Action Taken: {action}\n"
            f"Effective Date: {date}"
        )
        metadata = {
            "license": lic,
            "provider_name": name,
        }
        documents.append(Document(page_content=content, metadata=metadata))

    # Initialize the vector store with a local sentence-transformer model.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_db = Chroma.from_documents(documents, embeddings)
    return vector_db
# Ensure this matches your filename on Hugging Face exactly
# alert_list_path = "alert_list.csv"
def check_yellow_flags(license_number, vector_db, k=5):
    """Look up a license number in the ingested alert-list vector store.

    Args:
        license_number: License number to look for. ``None``, empty and
            whitespace-only values never match.
        vector_db: Object exposing ``similarity_search(query, k=...)`` that
            returns items with a ``page_content`` string.
        k: How many nearest neighbours to inspect. Similarity search ranks
            by embedding distance, so the exact record is not guaranteed to
            be the single closest hit.

    Returns:
        The ``page_content`` of the first (best-ranked) result that contains
        the license number, or ``None`` when no inspected result does.
    """
    needle = "" if license_number is None else str(license_number).strip()
    # An empty string is a substring of every record, which would raise a
    # false flag against whichever record happens to rank first.
    if not needle:
        return None

    results = vector_db.similarity_search(needle, k=k)

    # Similarity search can return merely "close" records, so confirm the
    # license text is really present before reporting a match.
    # NOTE(review): this is a substring test, so a very short license number
    # can still match inside a longer one - confirm that is acceptable.
    for doc in results or []:
        if needle in doc.page_content:
            return doc.page_content
    return None
# --- 2. SCHEMAS & HELPERS ---
class ProviderLicense(BaseModel):
    # Structured-output schema passed as ``response_schema`` to the Gemini
    # extraction call; every field is a plain string.
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose - pydantic may copy a model docstring into the generated schema
    # description, which would change what is sent to the model.

    # Name exactly as extracted from the license document.
    provider_name: str = Field(
        description="Full name of the healthcare provider",
    )
    # Used as the lookup key against the alert-list vector store.
    license_number: str = Field(
        description="The professional license number",
    )
    # Used as the lookup key for the NPPES registry query.
    npi_number: str = Field(
        description="The 10-digit National Provider Identifier (NPI)",
    )
    state: str = Field(
        description="The state where the license was issued",
    )
    # Parsed downstream with the "%Y-%m-%d" format.
    expiration_date: str = Field(
        description="Format: YYYY-MM-DD",
    )
def get_nppes_data(npi_number, timeout=10):
    """Fetch the first NPPES registry record for an NPI number.

    This is a best-effort lookup: any network failure, non-200 response or
    unexpected payload yields ``None`` instead of raising.

    Args:
        npi_number: NPI value sent as the ``number`` query parameter.
        timeout: Seconds to wait for the registry before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection, freezing the page.

    Returns:
        The first entry of the response's ``results`` list, or ``None`` when
        the registry reports no results or the lookup fails.
    """
    url = "https://npiregistry.cms.hhs.gov/api/?version=2.1"
    params = {"number": npi_number}

    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that is not valid JSON.
        return None

    if not isinstance(data, dict):
        return None

    # Error payloads carry no usable "results"; check the shape explicitly
    # rather than relying on a blanket exception handler.
    results = data.get("results")
    if not data.get("result_count", 0) or not isinstance(results, list) or not results:
        return None
    return results[0]
# --- 3. PAGE SETUP & INITIALIZATION ---
st.set_page_config(page_title="AI Credentialing Assistant", layout="wide")
st.title("🩺 Provider Credentialing AI")

# Load the RAG database from the alert-list CSV expected next to the app.
alert_list_path = "alert_list.csv"
vdb = get_vector_db(alert_list_path)

api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

# --- 4. MAIN WORKFLOW ---
# Only raster formats are offered: the upload is opened with PIL for both the
# preview and the extraction request, and PIL cannot open PDF files.
uploaded_file = st.sidebar.file_uploader(
    "Upload Medical License", type=["jpg", "jpeg", "png"]
)

if uploaded_file:
    # A corrupt or mislabeled upload makes PIL raise an OSError subclass;
    # report it and end this run instead of crashing the page.
    try:
        image = Image.open(uploaded_file)
    except OSError as e:
        st.error(f"Could not read the uploaded file as an image: {e}")
        st.stop()

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Document Preview")
        st.image(image, use_container_width=True)

    with col2:
        st.subheader("AI Extraction & Risk Analysis")
        with st.spinner("Extracting & Verifying..."):
            try:
                # A. Extraction: ask Gemini for JSON matching ProviderLicense.
                response = client.models.generate_content(
                    model="gemini-2.5-flash",
                    contents=["Extract details from this license.", image],
                    config=types.GenerateContentConfig(
                        response_mime_type="application/json",
                        response_schema=ProviderLicense,
                    ),
                )
                data = response.parsed
                if data is None:
                    # Fail with a readable message rather than an opaque
                    # AttributeError on the first field access below.
                    raise ValueError(
                        "The model response could not be parsed into license details."
                    )

                # B. Expiration Logic
                # A date in an unexpected format must not abort the registry
                # and alert-list checks further down, so handle it locally.
                try:
                    expiry = datetime.strptime(data.expiration_date, "%Y-%m-%d").date()
                except (TypeError, ValueError):
                    expiry = None

                st.metric("Provider Name", data.provider_name)
                st.write(f"**License:** {data.license_number} ({data.state})")

                if expiry is None:
                    st.markdown(
                        f"**Status:** :orange[Unknown] "
                        f"(Unrecognized expiration date: {data.expiration_date})"
                    )
                else:
                    is_active = expiry >= datetime.today().date()
                    status_color = "green" if is_active else "red"
                    status_label = "Valid" if is_active else "Expired"
                    st.markdown(
                        f"**Status:** :{status_color}[{status_label}] "
                        f"(Expires: {data.expiration_date})"
                    )

                # C. Federal Verification
                st.divider()
                st.subheader("Federal Registry (NPPES)")
                registry_data = get_nppes_data(data.npi_number)
                if registry_data:
                    basic = registry_data.get('basic', {})
                    # NPI-2 entries are shown by organization name, all other
                    # entries by first and last name.
                    if registry_data.get('enumeration_type') == 'NPI-2':
                        name = basic.get('organization_name')
                    else:
                        name = f"{basic.get('first_name')} {basic.get('last_name')}"
                    st.success(f"NPI Verified: {name}")
                else:
                    st.warning("NPI not found in Federal Registry")

                # D. RAG-based Yellow Flag Detection
                st.divider()
                st.subheader("⚠️ Risk Intelligence (RAG)")
                flag_context = check_yellow_flags(data.license_number, vdb)
                if flag_context:
                    st.error(f"YELLOW FLAG DETECTED for License {data.license_number}")
                    # Use Gemini to summarize the matched record for the user.
                    risk_summary = client.models.generate_content(
                        model="gemini-2.5-flash",
                        contents=[f"Based on this medical board record, summarize the risk in one sentence: {flag_context}"]
                    )
                    st.warning(risk_summary.text)
                    with st.expander("View Raw Alert Detail"):
                        st.text(flag_context)
                else:
                    st.success("No active flags found in the 2026 Medical Board Alert List.")
            except Exception as e:
                # Top-level UI boundary: show any failure instead of crashing.
                st.error(f"Processing Error: {e}")