Spaces:

LianHP
/

pa_ocr_hf_space

Sleeping

App Files Files Community

LianHP committed on Nov 16, 2025

Commit

79f880a

verified ·

1 Parent(s): 7f04f18

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.gradio/certificate.pem +31 -0
README.md +3 -9
app.py +93 -0
pa_extraction_summary.csv +0 -0
requirements.txt +3 -0
runtime.txt +1 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Pa Ocr Hf Space
-emoji: 🐠
-colorFrom: red
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: pa_ocr_hf_space
 app_file: app.py
+sdk: gradio
+sdk_version: 5.47.2
 ---

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# app.py
+import re
+import io
+import pdfplumber
+import pandas as pd
+import gradio as gr
+ICD10_RE = re.compile(r'\\b[A-Z][0-9]{2}(?:\\.[0-9A-Za-z]{1,4})?\\b')
+CPT_RE = re.compile(r'\\b(?:CPT[:\\s]*)?([0-9]{5})\\b')
+HCPCS_RE = re.compile(r'\\b(H[0-9]{4}|[A-Z][0-9]{4})\\b')
+DATE_RE = re.compile(r'\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
+                     r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|\\d{1,2}/\\d{1,2}/\\d{2,4})\\b',
+                     flags=re.IGNORECASE)
+PA_KEYWORDS = ['prior authorization','prior auth','pre-authorization','preauthorization','authorization required','prior approval']
+def extract_text_from_pdf(file_obj):
+    text_pages = []
+    with pdfplumber.open(file_obj) as pdf:
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text_pages.append(page_text)
+    return "\\n\\n".join(text_pages)
+def sentence_split(text):
+    sents = re.split(r'(?<=[\\.\\?\\:])\\s+(?=[A-Z0-9])', text.replace("\\n", " "))
+    return [s.strip() for s in sents if s.strip()]
+def find_nearby_sentences(text, keyword, window=2):
+    sents = sentence_split(text)
+    hits = []
+    for i, s in enumerate(sents):
+        if keyword.lower() in s.lower():
+            start = max(0, i-window)
+            end = min(len(sents), i+window+1)
+            hits.append(" ".join(sents[start:end]))
+    return hits
+def extract_pa_insights(text):
+    icd10 = sorted(set(ICD10_RE.findall(text)))
+    cpt = sorted(set(m for m in CPT_RE.findall(text)))
+    hcpcs = sorted(set(HCPCS_RE.findall(text)))
+    dates = sorted(set(DATE_RE.findall(text)))
+    pa_mentions = []
+    for kw in PA_KEYWORDS:
+        pa_mentions.extend(find_nearby_sentences(text, kw, window=2))
+    pa_mentions = list(dict.fromkeys(pa_mentions))
+    headings=[]
+    for line in text.splitlines():
+        line=line.strip()
+        if not line: continue
+        if line.isupper() and len(line)<120: headings.append(line)
+        elif line.endswith(':') and len(line)<120: headings.append(line)
+    rows=[
+        ("ICD-10 codes", ", ".join(icd10) or "None"),
+        ("CPT codes", ", ".join(cpt) or "None"),
+        ("HCPCS codes", ", ".join(hcpcs) or "None"),
+        ("Dates found", ", ".join(dates[:10]) or "None"),
+        ("PA snippets", " ||| ".join(pa_mentions[:10]) or "None"),
+        ("Headings", " ||| ".join(headings[:10]) or "None")
+    ]
+    df=pd.DataFrame(rows, columns=["Field","Extracted"])
+    return df, pa_mentions
+def ocr_and_extract(pdf_file):
+    if pdf_file is None:
+        return "Please upload a PDF.", None, None, None
+    try:
+        with open(pdf_file.name,"rb") as f:
+            file_bytes=f.read()
+        text=extract_text_from_pdf(io.BytesIO(file_bytes))
+        if not text.strip():
+            return "No selectable text found.", None, None, None
+        df, snippets=extract_pa_insights(text)
+        out_csv="pa_extraction_summary.csv"
+        df.to_csv(out_csv,index=False)
+        preview="\\n\\n".join(snippets[:5]) if snippets else "No PA snippets found."
+        return "Extraction successful.", df, preview, out_csv
+    except Exception as e:
+        return f"Error: {e}", None, None, None
+with gr.Blocks(title="PA OCR & CMS Extractor") as demo:  # Build the single-page UI; `demo` is launched below.
+    gr.Markdown("# Prior Authorization – CMS PDF Extractor")
+    pdf_in=gr.File(label="Upload PDF")  # Upload widget; the only input passed to ocr_and_extract.
+    run_btn=gr.Button("Run Extraction")
+    status = gr.Textbox(label="Status")  # Receives the status message (1st item of the returned tuple).
+    table = gr.Dataframe(row_count=10, col_count=5)  # Receives the summary DataFrame. NOTE(review): extract_pa_insights builds 6 rows x 2 columns, so 10x5 looks like a placeholder - confirm.
+    preview = gr.Textbox(label="PA Snippet Preview", lines=6)  # Receives the snippet preview text (3rd item).
+    download = gr.File(label="Download CSV")  # Receives the CSV path, or None on failure (4th item).
+    run_btn.click(fn=ocr_and_extract, inputs=[pdf_in], outputs=[status, table, preview, download])  # Output order must match the tuple order returned by ocr_and_extract.
+demo.launch(share=True)  # Runs on import (no __main__ guard). NOTE(review): share=True requests a public share link; presumably unnecessary when hosted as a Space - confirm.

pa_extraction_summary.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio==4.29.0
+pdfplumber
+pandas

runtime.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-3.10