Spaces:

pilayar
/

CredentialingPrototype

Running

File size: 7,224 Bytes

b259aa8
 
 
 
 
 
be9723a
 
ffe98fd
c030e17
ffe98fd
 
e25e91d
 
316ac19
ffe98fd
 
 
ce57399
5b21a8b
 
 
 
 
 
 
ce57399
 
 
 
ffe98fd
 
 
ce57399
 
 
 
 
 
 
 
 
 
ffe98fd
ce57399
 
 
 
ffe98fd
ce57399
ffe98fd
ce57399
 
ffe98fd
 
 
ce57399
53b8f0e
a67d7cf
ffe98fd
 
 
ce57399
 
 
ffe98fd
 
 
 
 
 
 
 
 
 
 
b259aa8
 
 
be9723a
b259aa8
 
 
be9723a
 
 
 
 
 
 
 
 
 
 
b259aa8
ffe98fd
 
 
 
 
0a410ad
ffe98fd
 
be9723a
b259aa8
 
ffe98fd
be9723a
b259aa8
 
 
 
 
 
 
 
 
 
ffe98fd
 
b259aa8
ffe98fd
b259aa8
47dbab1
be9723a
b259aa8
 
 
 
 
 
be9723a
ffe98fd
be9723a
 
 
b259aa8
be9723a
 
 
 
b259aa8
ffe98fd
be9723a
ffe98fd
be9723a
 
9efc99e
ffe98fd
 
 
 
 
 
 
 
 
9efc99e
ffe98fd
 
 
 
 
 
47dbab1
ffe98fd
 
 
 
 
 
 
 
b259aa8
be9723a

import streamlit as st
import os
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from PIL import Image
import requests
from datetime import datetime
import pandas as pd
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# --- 1. RAG CONFIGURATION & INGESTION ---
@st.cache_resource
def get_vector_db(file_path):
    """Build a Chroma vector store from the disciplinary alert-list CSV.

    Every non-empty CSV row becomes one ``Document`` whose text lists the
    provider name, license number, action taken and effective date. The
    license number and provider name are also copied into the metadata.

    Args:
        file_path: Path to the CSV file. Headers are matched after newlines
            are replaced by spaces and the text is stripped and upper-cased;
            the columns read are NAME, LICENSE NUMBER, TYPE OF ACTION and
            DATE OF ACTION.

    Returns:
        The ``Chroma`` store created from the row documents, embedded with
        the ``all-MiniLM-L6-v2`` HuggingFace model.
    """
    def _read(encoding):
        # dtype=str keeps every cell exactly as written. Without it pandas
        # infers numeric types, and a numeric license column containing any
        # blank row is coerced to float ("12345" -> "12345.0").
        return pd.read_csv(file_path, encoding=encoding, dtype=str)

    try:
        # utf-8-sig handles files with a BOM (often added by Excel).
        df = _read('utf-8-sig')
    except UnicodeDecodeError:
        # latin1 decodes any byte sequence, so it serves as the fallback.
        df = _read('latin1')

    # Clean headers: "LICENSE\nNUMBER" becomes "LICENSE NUMBER".
    df.columns = [str(col).replace('\n', ' ').strip().upper() for col in df.columns]

    def _cell(row, column):
        """Return the stripped cell text, or None for a NaN/blank cell."""
        value = row.get(column, 'Unknown')  # a missing column reads as 'Unknown'
        if pd.isna(value):
            return None
        return str(value).strip() or None

    documents = []
    for _, row in df.iterrows():
        name = _cell(row, 'NAME')
        lic = _cell(row, 'LICENSE NUMBER')

        # Skip rows with neither a name nor a license (e.g. month header rows).
        if name is None and lic is None:
            continue

        # Missing cells are shown as 'Unknown' instead of the literal "nan".
        name = name or 'Unknown'
        lic = lic or 'Unknown'
        action = _cell(row, 'TYPE OF ACTION') or 'Unknown'
        date = _cell(row, 'DATE OF ACTION') or 'Unknown'

        content = (
            f"Provider Name: {name}\n"
            f"License Number: {lic}\n"
            f"Action Taken: {action}\n"
            f"Effective Date: {date}"
        )
        metadata = {
            "license": lic,
            "provider_name": name,
        }
        documents.append(Document(page_content=content, metadata=metadata))

    # Embed the documents and index them in Chroma.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_db = Chroma.from_documents(documents, embeddings)
    return vector_db

# Ensure this matches your filename on Hugging Face exactly
# alert_list_path = "alert_list.csv"

def check_yellow_flags(license_number, vector_db, top_k=5):
    """Look up a license number in the alert-list vector store.

    The similarity search only proposes candidates; a candidate counts as a
    hit only when the license number appears as a whole token
    (case-insensitive) in that record's "License Number:" line. Matching
    against the full record text would let a value such as "2026" hit the
    date line, and plain substring matching would let "A1234" hit "A12345".

    Args:
        license_number: License number to look for. ``None``, empty or
            whitespace-only values never match.
        vector_db: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with a ``page_content`` attribute (and
            optionally ``metadata`` holding a ``"license"`` key).
        top_k: Number of nearest candidates to inspect. An exact record is
            not guaranteed to be ranked first by an embedding search.

    Returns:
        The ``page_content`` of the first matching record, or ``None``.
    """
    import re  # local import: `re` is not imported at module level

    needle = "" if license_number is None else str(license_number).strip()
    if not needle:
        # "" is a substring of every string, so it would flag any record.
        return None

    # Whole-token match: the number may not be glued to other alphanumerics.
    pattern = re.compile(
        r"(?<![0-9A-Za-z])" + re.escape(needle) + r"(?![0-9A-Za-z])",
        re.IGNORECASE,
    )
    prefix = "License Number:"

    for doc in vector_db.similarity_search(needle, k=top_k):
        content = doc.page_content
        field = next(
            (line[len(prefix):] for line in content.splitlines() if line.startswith(prefix)),
            None,
        )
        if field is None:
            # No license line in the text; fall back to the metadata copy.
            meta = getattr(doc, "metadata", None) or {}
            field = str(meta.get("license", ""))
        if pattern.search(field):
            return content
    return None

# --- 2. SCHEMAS & HELPERS ---
# Structured-output schema for the fields extracted from an uploaded license;
# it is passed as `response_schema` to the Gemini extraction call below.
# Every field is a plain `str` with no validators, so values arrive exactly as
# the model produced them. The Field descriptions are part of the schema.
# NOTE(review): documented with comments rather than a class docstring, since
# a docstring would presumably be emitted into the generated schema - confirm.
class ProviderLicense(BaseModel):
    # Name of the provider as printed on the license.
    provider_name: str = Field(description="Full name of the healthcare provider")
    # License identifier; later used as the key for the alert-list lookup.
    license_number: str = Field(description="The professional license number")
    # NPI; later sent to the NPPES registry lookup as the `number` parameter.
    npi_number: str = Field(description="The 10-digit National Provider Identifier (NPI)")
    # Issuing state; only displayed next to the license number.
    state: str = Field(description="The state where the license was issued")
    # Later parsed with "%Y-%m-%d", so any other format fails that parse.
    expiration_date: str = Field(description="Format: YYYY-MM-DD")

def get_nppes_data(npi_number, timeout=10):
    """Fetch the NPPES registry record for an NPI number.

    This is a best-effort lookup: any network failure, non-200 response,
    malformed payload or empty result yields ``None`` rather than raising.

    Args:
        npi_number: NPI to look up. ``None`` or blank values return ``None``
            without issuing a request.
        timeout: Seconds to wait for the registry before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The first entry of the response's ``results`` list, or ``None``.
    """
    npi = "" if npi_number is None else str(npi_number).strip()
    if not npi:
        return None

    url = "https://npiregistry.cms.hhs.gov/api/?version=2.1"
    try:
        response = requests.get(url, params={"number": npi}, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Connection/timeout errors, or a body that is not valid JSON.
        return None

    if not isinstance(data, dict):
        return None

    count = data.get("result_count", 0)
    results = data.get("results")
    if isinstance(count, int) and count > 0 and isinstance(results, list) and results:
        return results[0]
    return None

# --- 3. PAGE SETUP & INITIALIZATION ---
# Configure the Streamlit page (browser title, wide layout) and render the
# app heading.
st.set_page_config(page_title="AI Credentialing Assistant", layout="wide")
st.title("🩺 Provider Credentialing AI")

# Build the alert-list vector store from the CSV at this relative path.
# get_vector_db is decorated with st.cache_resource.
alert_list_path = "alert_list.csv"
vdb = get_vector_db(alert_list_path)

# Read the Gemini API key from the environment. os.environ.get returns None
# when GEMINI_API_KEY is unset, and that value is passed to the client as-is.
# NOTE(review): confirm how genai.Client behaves when api_key is None.
api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

# --- 4. MAIN WORKFLOW ---
# Flow: upload -> Gemini structured extraction -> expiry check -> NPPES lookup
# -> alert-list lookup. Each check renders its own section in the right column.
uploaded_file = st.sidebar.file_uploader("Upload Medical License", type=["jpg", "jpeg", "png", "pdf"])

if uploaded_file:
    col1, col2 = st.columns(2)

    # The uploader accepts PDFs, but PIL cannot decode them. PDFs are therefore
    # handed to Gemini as raw bytes and only images go through Image.open.
    is_pdf = (
        uploaded_file.type == "application/pdf"
        or uploaded_file.name.lower().endswith(".pdf")
    )
    image = None
    document_part = None  # what is sent to the model; None if the file is unreadable
    if is_pdf:
        document_part = types.Part.from_bytes(
            data=uploaded_file.getvalue(),
            mime_type="application/pdf",
        )
    else:
        try:
            image = Image.open(uploaded_file)
            document_part = image
        except OSError as e:
            # PIL raises OSError subclasses for unreadable/corrupt images.
            st.error(f"Processing Error: could not read the uploaded image ({e})")

    with col1:
        st.subheader("Document Preview")
        if image is not None:
            st.image(image, use_container_width=True)
        elif is_pdf:
            st.info(f"PDF uploaded: {uploaded_file.name} (preview not available)")

    if document_part is not None:
        with col2:
            st.subheader("AI Extraction & Risk Analysis")
            with st.spinner("Extracting & Verifying..."):
                try:
                    # A. Extraction
                    response = client.models.generate_content(
                        model="gemini-2.5-flash",
                        contents=["Extract details from this license.", document_part],
                        config=types.GenerateContentConfig(
                            response_mime_type="application/json",
                            response_schema=ProviderLicense,
                        ),
                    )
                    data = response.parsed
                    if data is None:
                        # Without this, the failure would surface as an
                        # opaque AttributeError on the first field access.
                        raise ValueError("the model did not return structured license data")

                    # B. Expiration Logic
                    # A date the model returned in another format must not
                    # abort the registry and alert checks below.
                    try:
                        expiry = datetime.strptime(data.expiration_date, "%Y-%m-%d").date()
                    except (TypeError, ValueError):
                        expiry = None

                    st.metric("Provider Name", data.provider_name)
                    st.write(f"**License:** {data.license_number} ({data.state})")

                    if expiry is None:
                        st.markdown(f"**Status:** :orange[Unknown] (Expires: {data.expiration_date})")
                    else:
                        is_active = expiry >= datetime.today().date()
                        status_color = "green" if is_active else "red"
                        st.markdown(f"**Status:** :{status_color}[{ 'Valid' if is_active else 'Expired'}] (Expires: {data.expiration_date})")

                    # C. Federal Verification
                    st.divider()
                    st.subheader("Federal Registry (NPPES)")
                    registry_data = get_nppes_data(data.npi_number)

                    if registry_data:
                        basic = registry_data.get('basic', {})
                        if registry_data.get('enumeration_type') == 'NPI-2':
                            # NPI-2 records carry an organization name.
                            name = basic.get('organization_name')
                        else:
                            # Join only the parts present, avoiding "None None".
                            name = " ".join(
                                part for part in (basic.get('first_name'), basic.get('last_name')) if part
                            )
                        st.success(f"NPI Verified: {name or 'Unknown'}")
                    else:
                        st.warning("NPI not found in Federal Registry")

                    # D. RAG-based Yellow Flag Detection
                    st.divider()
                    st.subheader("⚠️ Risk Intelligence (RAG)")

                    flag_context = check_yellow_flags(data.license_number, vdb)

                    if flag_context:
                        st.error(f"YELLOW FLAG DETECTED for License {data.license_number}")
                        # Use Gemini to summarize the disciplinary action for the user
                        risk_summary = client.models.generate_content(
                            model="gemini-2.5-flash",
                            contents=[f"Based on this medical board record, summarize the risk in one sentence: {flag_context}"]
                        )
                        if risk_summary.text:
                            st.warning(risk_summary.text)
                        with st.expander("View Raw Alert Detail"):
                            st.text(flag_context)
                    else:
                        st.success("No active flags found in the 2026 Medical Board Alert List.")

                except Exception as e:
                    # UI boundary: report any failure in the pipeline to the
                    # user instead of showing a raw traceback.
                    st.error(f"Processing Error: {e}")