Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gradio/certificate.pem +31 -0
- README.md +3 -9
- app.py +93 -0
- pa_extraction_summary.csv +0 -0
- requirements.txt +3 -0
- runtime.txt +1 -0
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 🐠
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
-
|
|
|
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: pa_ocr_hf_space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
app_file: app.py
|
| 4 |
+
sdk: gradio
|
| 5 |
+
sdk_version: 5.47.2
|
| 6 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# app.py
|
| 3 |
+
import re
|
| 4 |
+
import io
|
| 5 |
+
import pdfplumber
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
# Patterns are compiled once at import time. They are raw strings, so every
# regex escape (\b, \s, \d, \.) is written with a SINGLE backslash; a doubled
# backslash inside r'...' would make the regex look for a literal "\" character.

# ICD-10-style code: one capital letter, two digits, optional ".xxxx" suffix.
ICD10_RE = re.compile(r'\b[A-Z][0-9]{2}(?:\.[0-9A-Za-z]{1,4})?\b')

# Five-digit code, optionally preceded by a "CPT" label; only the digits are
# captured, so findall() returns the bare code.
CPT_RE = re.compile(r'\b(?:CPT[:\s]*)?([0-9]{5})\b')

# HCPCS-style code: one capital letter followed by four digits.
HCPCS_RE = re.compile(r'\b(H[0-9]{4}|[A-Z][0-9]{4})\b')

# Month names (three-letter or full form) or numeric N/N/N dates.
# NOTE(review): a bare month name is matched on its own (no day/year), and
# IGNORECASE means the ordinary word "may" also matches.
DATE_RE = re.compile(
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
    r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|'
    r'\d{1,2}/\d{1,2}/\d{2,4})\b',
    flags=re.IGNORECASE,
)

# Phrases searched for case-insensitively by substring to locate
# prior-authorization language in the extracted text.
PA_KEYWORDS = [
    'prior authorization',
    'prior auth',
    'pre-authorization',
    'preauthorization',
    'authorization required',
    'prior approval',
]
|
| 16 |
+
|
| 17 |
+
def extract_text_from_pdf(file_obj):
    """Return the text layer of a PDF, one blank line between pages.

    Args:
        file_obj: Whatever ``pdfplumber.open`` accepts (it is passed through
            unchanged).

    Returns:
        The text of every page that yielded any text, joined by a blank line.
        Pages for which ``extract_text()`` returns nothing are skipped, so the
        result is an empty string when no page has extractable text.
    """
    text_pages = []
    with pdfplumber.open(file_obj) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text_pages.append(page_text)
    # A real newline pair: "\\n\\n" would insert the literal characters
    # backslash-n between pages instead of a blank line.
    return "\n\n".join(text_pages)
|
| 25 |
+
|
| 26 |
+
def sentence_split(text):
    """Split *text* into rough sentences.

    Newlines are flattened to spaces first. A split happens at whitespace that
    follows ``.``, ``?`` or ``:`` and precedes a capital letter or a digit, so
    the terminating punctuation stays with the preceding sentence.

    Args:
        text: The string to split.

    Returns:
        A list of stripped, non-empty sentence strings (empty for blank input).
    """
    flattened = text.replace("\n", " ")
    # Single backslashes inside the raw string: r'\\s+' would require a literal
    # backslash followed by "s" and therefore never split on whitespace.
    pieces = re.split(r'(?<=[.?:])\s+(?=[A-Z0-9])', flattened)
    return [piece.strip() for piece in pieces if piece.strip()]
|
| 29 |
+
|
| 30 |
+
def find_nearby_sentences(text, keyword, window=2):
    """Collect context snippets around every sentence that mentions *keyword*.

    The text is split with ``sentence_split``; the keyword test is a
    case-insensitive substring check on each sentence.

    Args:
        text: The text to search.
        keyword: The phrase to look for.
        window: How many sentences before and after a hit to include.

    Returns:
        One string per matching sentence, made of the hit and up to *window*
        neighbours on each side joined by single spaces. Snippets from
        adjacent hits may overlap.
    """
    sentences = sentence_split(text)
    total = len(sentences)
    return [
        # Clamp the slice to the list bounds at both ends.
        " ".join(sentences[max(0, idx - window):min(total, idx + window + 1)])
        for idx, sentence in enumerate(sentences)
        if keyword.lower() in sentence.lower()
    ]
|
| 39 |
+
|
| 40 |
+
def extract_pa_insights(text):
    """Summarise codes, dates, PA snippets and headings found in *text*.

    Args:
        text: The text to analyse.

    Returns:
        A ``(df, pa_mentions)`` tuple. ``df`` is a two-column DataFrame
        ("Field", "Extracted") with six rows; list-like fields are joined into
        one string, or "None" when nothing was found. Dates, PA snippets and
        headings are capped at ten entries in the table. ``pa_mentions`` is
        the full, order-preserving, de-duplicated list of PA context snippets.
    """
    code_fields = (
        ("ICD-10 codes", ICD10_RE),
        ("CPT codes", CPT_RE),
        ("HCPCS codes", HCPCS_RE),
    )
    # Unique matches per pattern, sorted for a stable display order.
    summary = [
        (label, ", ".join(sorted(set(pattern.findall(text)))) or "None")
        for label, pattern in code_fields
    ]

    unique_dates = sorted(set(DATE_RE.findall(text)))
    summary.append(("Dates found", ", ".join(unique_dates[:10]) or "None"))

    # Gather snippets keyword by keyword, dropping repeats but keeping the
    # order in which each snippet was first seen.
    pa_mentions = []
    already_seen = set()
    for phrase in PA_KEYWORDS:
        for snippet in find_nearby_sentences(text, phrase, window=2):
            if snippet not in already_seen:
                already_seen.add(snippet)
                pa_mentions.append(snippet)
    summary.append(("PA snippets", " ||| ".join(pa_mentions[:10]) or "None"))

    # A heading is a short (< 120 chars) non-empty line that is either all
    # upper-case or ends with a colon.
    stripped_lines = (raw.strip() for raw in text.splitlines())
    headings = [
        candidate
        for candidate in stripped_lines
        if candidate
        and len(candidate) < 120
        and (candidate.isupper() or candidate.endswith(':'))
    ]
    summary.append(("Headings", " ||| ".join(headings[:10]) or "None"))

    frame = pd.DataFrame(summary, columns=["Field", "Extracted"])
    return frame, pa_mentions
|
| 65 |
+
|
| 66 |
+
def ocr_and_extract(pdf_file):
    """Gradio callback: extract text from an uploaded PDF and summarise it.

    Args:
        pdf_file: The value of the upload component. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(status, table, preview, csv_path)``. On success ``table``
        is the summary DataFrame, ``preview`` holds up to five PA snippets
        separated by blank lines, and ``csv_path`` points at the written CSV.
        On any failure, or when there is nothing to process, the last three
        items are ``None`` and ``status`` explains why.
    """
    import os
    import tempfile

    if pdf_file is None:
        return "Please upload a PDF.", None, None, None
    try:
        # Accept a plain path as well as a file wrapper carrying `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name
        with open(pdf_path, "rb") as f:
            file_bytes = f.read()
        text = extract_text_from_pdf(io.BytesIO(file_bytes))
        if not text.strip():
            return "No selectable text found.", None, None, None
        df, snippets = extract_pa_insights(text)
        # Write into a fresh per-request directory: a fixed filename in the
        # working directory would be overwritten by concurrent requests, and
        # one user could be handed another user's results. The file keeps its
        # friendly name for the download.
        out_dir = tempfile.mkdtemp(prefix="pa_extraction_")
        out_csv = os.path.join(out_dir, "pa_extraction_summary.csv")
        df.to_csv(out_csv, index=False)
        # Real newlines here; "\\n\\n" would show a literal backslash-n.
        preview = "\n\n".join(snippets[:5]) if snippets else "No PA snippets found."
        return "Extraction successful.", df, preview, out_csv
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"Error: {e}", None, None, None
|
| 82 |
+
|
| 83 |
+
# UI layout: upload a PDF, run the extraction, then show the status, the
# summary table, a snippet preview and a CSV download.
with gr.Blocks(title="PA OCR & CMS Extractor") as demo:
    gr.Markdown("# Prior Authorization – CMS PDF Extractor")
    # Only PDFs can be processed, so restrict the file picker accordingly.
    pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"])
    run_btn = gr.Button("Run Extraction")
    status = gr.Textbox(label="Status")
    # The callback returns a two-column frame ("Field", "Extracted"); declare
    # those headers instead of an unrelated 10x5 empty grid.
    table = gr.Dataframe(headers=["Field", "Extracted"])
    preview = gr.Textbox(label="PA Snippet Preview", lines=6)
    download = gr.File(label="Download CSV")
    run_btn.click(fn=ocr_and_extract, inputs=[pdf_in], outputs=[status, table, preview, download])

# No share=True: the README front matter declares a Hugging Face Space
# (sdk: gradio), which already serves the app publicly, and share links are
# not supported there.
demo.launch()
|
pa_extraction_summary.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.29.0
|
| 2 |
+
pdfplumber
|
| 3 |
+
pandas
|
runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.10
|