LianHP committed on
Commit
79f880a
·
verified ·
1 Parent(s): 7f04f18

Upload folder using huggingface_hub

Browse files
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Pa Ocr Hf Space
3
- emoji: 🐠
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: pa_ocr_hf_space
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.47.2
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # app.py
3
+ import re
4
+ import io
5
+ import pdfplumber
6
+ import pandas as pd
7
+ import gradio as gr
8
+
9
# Pre-compiled patterns used to pull codes and dates out of extracted PDF text.
# Each pattern is a raw string with SINGLE backslashes so that \b, \s, \d and
# \. reach the regex engine as escapes; a doubled backslash inside a raw
# string would instead match a literal backslash character and never fire on
# ordinary document text.

# A capital letter, two digits and an optional ".xxxx" suffix (e.g. "E11.9").
ICD10_RE = re.compile(r'\b[A-Z][0-9]{2}(?:\.[0-9A-Za-z]{1,4})?\b')
# Five digits, optionally preceded by a "CPT" label; group 1 is the digits.
CPT_RE = re.compile(r'\b(?:CPT[:\s]*)?([0-9]{5})\b')
# A capital letter followed by four digits (e.g. "H0031", "J1100").
HCPCS_RE = re.compile(r'\b(H[0-9]{4}|[A-Z][0-9]{4})\b')
# Either a month name (full or three-letter form) or a numeric N/N/N date.
# NOTE(review): a bare month name matches on its own, so the verb "may" is
# reported as a date under IGNORECASE -- confirm whether that is intended.
DATE_RE = re.compile(
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
    r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|\d{1,2}/\d{1,2}/\d{2,4})\b',
    flags=re.IGNORECASE,
)
# Phrases whose surrounding sentences are reported as prior-authorization snippets.
PA_KEYWORDS = ['prior authorization','prior auth','pre-authorization','preauthorization','authorization required','prior approval']
16
+
17
def extract_text_from_pdf(file_obj):
    """Return the extractable text of a PDF, one blank line between pages.

    Args:
        file_obj: The object handed to ``pdfplumber.open`` (the caller in
            this file passes an in-memory binary stream).

    Returns:
        The text of every page that yielded any text, joined by a real blank
        line. An empty string when no page has extractable text.
    """
    with pdfplumber.open(file_obj) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages with no text yield a falsy value and are skipped. The separator is
    # an actual pair of newlines, not the two-character sequence backslash-n.
    return "\n\n".join(page_text for page_text in page_texts if page_text)
25
+
26
def sentence_split(text):
    """Split text into sentence-like chunks.

    Newlines are flattened to spaces first. A boundary is whitespace that
    follows ``.``, ``?`` or ``:`` and precedes a capital letter or digit.

    Args:
        text: The text to split.

    Returns:
        A list of stripped, non-empty chunks in document order.
    """
    flattened = text.replace("\n", " ")
    # Single backslashes are not needed inside the character class; \s must be
    # a real whitespace escape for the split to ever trigger.
    sents = re.split(r'(?<=[.?:])\s+(?=[A-Z0-9])', flattened)
    return [s.strip() for s in sents if s.strip()]
29
+
30
def find_nearby_sentences(text, keyword, window=2):
    """Collect context windows around every sentence that mentions a keyword.

    Args:
        text: Text to search; it is split with ``sentence_split``.
        keyword: Phrase to look for, compared case-insensitively.
        window: Number of sentences to include on each side of a hit.

    Returns:
        One string per matching sentence: the hit and up to ``window``
        neighbours before and after it, joined by single spaces.
    """
    sentences = sentence_split(text)
    total = len(sentences)
    return [
        " ".join(sentences[max(0, idx - window):min(total, idx + window + 1)])
        for idx, sentence in enumerate(sentences)
        if keyword.lower() in sentence.lower()
    ]
39
+
40
def extract_pa_insights(text):
    """Summarise codes, dates, prior-authorization snippets and headings.

    Args:
        text: Text previously extracted from a PDF.

    Returns:
        A ``(df, pa_mentions)`` tuple. ``df`` is a two-column DataFrame
        ("Field", "Extracted") with one row per category; ``pa_mentions`` is
        the complete de-duplicated list of keyword context snippets.
    """
    icd_codes = sorted(set(ICD10_RE.findall(text)))
    cpt_codes = sorted(set(CPT_RE.findall(text)))
    hcpcs_codes = sorted(set(HCPCS_RE.findall(text)))
    date_hits = sorted(set(DATE_RE.findall(text)))

    # Gather snippets for every keyword, then drop repeats while keeping the
    # first-seen order.
    collected = []
    for phrase in PA_KEYWORDS:
        collected.extend(find_nearby_sentences(text, phrase, window=2))
    pa_mentions = list(dict.fromkeys(collected))

    # A heading is a short (<120 chars) line that is all upper-case or ends
    # with a colon.
    stripped_lines = (raw.strip() for raw in text.splitlines())
    headings = [
        candidate
        for candidate in stripped_lines
        if candidate
        and len(candidate) < 120
        and (candidate.isupper() or candidate.endswith(':'))
    ]

    def _joined(values, separator):
        # Empty categories are reported as the literal string "None".
        return separator.join(values) or "None"

    rows = [
        ("ICD-10 codes", _joined(icd_codes, ", ")),
        ("CPT codes", _joined(cpt_codes, ", ")),
        ("HCPCS codes", _joined(hcpcs_codes, ", ")),
        ("Dates found", _joined(date_hits[:10], ", ")),
        ("PA snippets", _joined(pa_mentions[:10], " ||| ")),
        ("Headings", _joined(headings[:10], " ||| ")),
    ]
    df = pd.DataFrame(rows, columns=["Field", "Extracted"])
    return df, pa_mentions
65
+
66
def ocr_and_extract(pdf_file):
    """Gradio callback: extract text from an uploaded PDF and summarise it.

    Despite the name, no OCR is performed here; only text that
    ``extract_text_from_pdf`` can read from the PDF is used.

    Args:
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path. ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(status, df, preview, csv_path)`` matching the four UI
        outputs. On any failure the last three items are ``None`` and
        ``status`` carries the message.
    """
    import os
    import tempfile

    if pdf_file is None:
        return "Please upload a PDF.", None, None, None
    try:
        # Accept both a plain path string and a file wrapper exposing `.name`.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        with open(pdf_path, "rb") as f:
            file_bytes = f.read()
        text = extract_text_from_pdf(io.BytesIO(file_bytes))
        if not text.strip():
            return "No selectable text found.", None, None, None
        df, snippets = extract_pa_insights(text)
        # Write into a fresh temp directory per request so concurrent users do
        # not overwrite each other's CSV and nothing lands in the app folder.
        out_dir = tempfile.mkdtemp(prefix="pa_extraction_")
        out_csv = os.path.join(out_dir, "pa_extraction_summary.csv")
        df.to_csv(out_csv, index=False)
        # Real blank lines between snippets, not a literal backslash-n.
        preview = "\n\n".join(snippets[:5]) if snippets else "No PA snippets found."
        return "Extraction successful.", df, preview, out_csv
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return f"Error: {e}", None, None, None
82
+
83
# Gradio UI: one upload box and a button wired to `ocr_and_extract`, whose four
# return values fill the status box, the table, the snippet preview and the
# CSV download in that order.
with gr.Blocks(title="PA OCR & CMS Extractor") as demo:
    gr.Markdown("# Prior Authorization – CMS PDF Extractor")
    pdf_in = gr.File(label="Upload PDF")
    run_btn = gr.Button("Run Extraction")
    status = gr.Textbox(label="Status")
    table = gr.Dataframe(row_count=10, col_count=5)
    preview = gr.Textbox(label="PA Snippet Preview", lines=6)
    download = gr.File(label="Download CSV")
    run_btn.click(fn=ocr_and_extract, inputs=[pdf_in], outputs=[status, table, preview, download])

# Launch only when run as a script so importing this module has no side
# effects. `share=True` is dropped: a hosted Space is already publicly
# reachable and does not need a share tunnel.
if __name__ == "__main__":
    demo.launch()
pa_extraction_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.29.0
2
+ pdfplumber
3
+ pandas
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10