Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,11 +10,11 @@ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
|
|
| 10 |
|
| 11 |
# Configure Streamlit
|
| 12 |
st.set_page_config(
|
| 13 |
-
page_title="PDF Tools
|
| 14 |
layout="wide",
|
| 15 |
)
|
| 16 |
|
| 17 |
-
# Model
|
| 18 |
MODELS = {
|
| 19 |
"DeepSeek v3": {
|
| 20 |
"api_url": "https://api.deepseek.com/v1/chat/completions",
|
|
@@ -93,9 +93,9 @@ def clean_json_response(text):
|
|
| 93 |
if not text:
|
| 94 |
return None
|
| 95 |
original = text
|
| 96 |
-
#
|
| 97 |
text = re.sub(r'```(?:json)?', '', text).strip()
|
| 98 |
-
#
|
| 99 |
start = text.find('{')
|
| 100 |
end = text.rfind('}') + 1
|
| 101 |
if start < 0 or end < 1:
|
|
@@ -111,7 +111,6 @@ def clean_json_response(text):
|
|
| 111 |
return None
|
| 112 |
|
| 113 |
def get_extraction_prompt(model_choice, text):
|
| 114 |
-
# NOTE: every prompt below includes the word "json" in lowercase
|
| 115 |
if model_choice == "DeepSeek v3":
|
| 116 |
return (
|
| 117 |
"Extract complete invoice information and return ONLY a valid json object with these fields:\n"
|
|
@@ -121,12 +120,13 @@ def get_extraction_prompt(model_choice, text):
|
|
| 121 |
' "po_number": "string or null",\n'
|
| 122 |
' "invoice_value": "string with currency symbol",\n'
|
| 123 |
' "line_items": [\n'
|
| 124 |
-
" {
|
|
|
|
| 125 |
" ]\n"
|
| 126 |
"}\n"
|
| 127 |
"Rules:\n"
|
| 128 |
"1. Use null for missing fields\n"
|
| 129 |
-
"2. Do not include any
|
| 130 |
"Invoice Text:\n"
|
| 131 |
+ text
|
| 132 |
)
|
|
@@ -134,23 +134,43 @@ def get_extraction_prompt(model_choice, text):
|
|
| 134 |
elif model_choice == "DeepSeek R1":
|
| 135 |
return (
|
| 136 |
"Please extract invoice info from the text below and return only raw json:\n"
|
| 137 |
-
"{
|
|
|
|
|
|
|
|
|
|
| 138 |
"Invoice Text:\n"
|
| 139 |
+ text
|
| 140 |
)
|
| 141 |
|
| 142 |
-
else: # Llama
|
| 143 |
return (
|
| 144 |
-
"Extract
|
|
|
|
| 145 |
"{\n"
|
| 146 |
-
' "invoice_header": {
|
| 147 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
"}\n"
|
| 149 |
"Rules:\n"
|
| 150 |
-
"1.
|
| 151 |
-
"2.
|
| 152 |
-
"3. Currency values
|
| 153 |
-
"4.
|
|
|
|
| 154 |
"Invoice Text:\n"
|
| 155 |
+ text
|
| 156 |
)
|
|
@@ -169,7 +189,7 @@ def extract_invoice_info(model_choice, text):
|
|
| 169 |
if not data:
|
| 170 |
return None
|
| 171 |
|
| 172 |
-
# normalize
|
| 173 |
if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
|
| 174 |
hdr = data.setdefault("invoice_header", {})
|
| 175 |
for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
|
|
@@ -192,7 +212,7 @@ def extract_invoice_info(model_choice, text):
|
|
| 192 |
tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
|
| 193 |
|
| 194 |
with tab1:
|
| 195 |
-
st.title("PDF to Bullet
|
| 196 |
pdf = st.file_uploader("Upload PDF", type="pdf")
|
| 197 |
pct = st.slider("Summarization (%)", 1, 100, 20)
|
| 198 |
if st.button("Summarize") and pdf:
|
|
@@ -200,9 +220,9 @@ with tab1:
|
|
| 200 |
keys = extract_key_phrases(txt)
|
| 201 |
scores = score_sentences(txt, keys)
|
| 202 |
n = max(1, len(scores) * pct // 100)
|
| 203 |
-
|
| 204 |
st.subheader("Summary")
|
| 205 |
-
st.markdown(
|
| 206 |
|
| 207 |
with tab2:
|
| 208 |
st.title("Invoice Extractor")
|
|
@@ -212,7 +232,7 @@ with tab2:
|
|
| 212 |
txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
|
| 213 |
info = extract_invoice_info(mdl, txt)
|
| 214 |
if info:
|
| 215 |
-
st.success("
|
| 216 |
if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
|
| 217 |
h = info["invoice_header"]
|
| 218 |
c1, c2, c3 = st.columns(3)
|
|
|
|
| 10 |
|
| 11 |
# Configure Streamlit
|
| 12 |
st.set_page_config(
|
| 13 |
+
page_title="PDF Tools – Summarizer & Invoice Extractor",
|
| 14 |
layout="wide",
|
| 15 |
)
|
| 16 |
|
| 17 |
+
# Model configurations
|
| 18 |
MODELS = {
|
| 19 |
"DeepSeek v3": {
|
| 20 |
"api_url": "https://api.deepseek.com/v1/chat/completions",
|
|
|
|
| 93 |
if not text:
|
| 94 |
return None
|
| 95 |
original = text
|
| 96 |
+
# strip any ``` fences
|
| 97 |
text = re.sub(r'```(?:json)?', '', text).strip()
|
| 98 |
+
# locate outermost JSON braces
|
| 99 |
start = text.find('{')
|
| 100 |
end = text.rfind('}') + 1
|
| 101 |
if start < 0 or end < 1:
|
|
|
|
| 111 |
return None
|
| 112 |
|
| 113 |
def get_extraction_prompt(model_choice, text):
|
|
|
|
| 114 |
if model_choice == "DeepSeek v3":
|
| 115 |
return (
|
| 116 |
"Extract complete invoice information and return ONLY a valid json object with these fields:\n"
|
|
|
|
| 120 |
' "po_number": "string or null",\n'
|
| 121 |
' "invoice_value": "string with currency symbol",\n'
|
| 122 |
' "line_items": [\n'
|
| 123 |
+
" { \"description\": \"string\", \"quantity\": \"number or string\", "
|
| 124 |
+
"\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }\n"
|
| 125 |
" ]\n"
|
| 126 |
"}\n"
|
| 127 |
"Rules:\n"
|
| 128 |
"1. Use null for missing fields\n"
|
| 129 |
+
"2. Do not include any extra text\n\n"
|
| 130 |
"Invoice Text:\n"
|
| 131 |
+ text
|
| 132 |
)
|
|
|
|
| 134 |
elif model_choice == "DeepSeek R1":
|
| 135 |
return (
|
| 136 |
"Please extract invoice info from the text below and return only raw json:\n"
|
| 137 |
+
"{ \"invoice_number\": \"string or null\", \"invoice_date\": \"YYYY-MM-DD or null\", "
|
| 138 |
+
"\"po_number\": \"string or null\", \"invoice_value\": \"string with currency or null\", "
|
| 139 |
+
"\"line_items\": [{ \"description\": \"string\", \"quantity\": \"number or string\", "
|
| 140 |
+
"\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }] }\n"
|
| 141 |
"Invoice Text:\n"
|
| 142 |
+ text
|
| 143 |
)
|
| 144 |
|
| 145 |
+
else: # Llama & Mistral
|
| 146 |
return (
|
| 147 |
+
"You are given the text of an invoice. Extract the invoice information and return ONLY a valid json object "
|
| 148 |
+
"formatted exactly as below (nothing else):\n"
|
| 149 |
"{\n"
|
| 150 |
+
' "invoice_header": {\n'
|
| 151 |
+
' "invoice_number": "string",\n'
|
| 152 |
+
' "invoice_date": "YYYY-MM-DD",\n'
|
| 153 |
+
' "po_number": "string or null",\n'
|
| 154 |
+
' "invoice_value": "string with currency symbol",\n'
|
| 155 |
+
' "supplier_name": "string or null",\n'
|
| 156 |
+
' "customer_name": "string or null"\n'
|
| 157 |
+
' },\n'
|
| 158 |
+
' "line_items": [\n'
|
| 159 |
+
' {\n'
|
| 160 |
+
' "item_number": "string or null",\n'
|
| 161 |
+
' "description": "string",\n'
|
| 162 |
+
' "quantity": number,\n'
|
| 163 |
+
' "unit_price": "string with currency symbol",\n'
|
| 164 |
+
' "total_price": "string with currency symbol"\n'
|
| 165 |
+
' }\n'
|
| 166 |
+
' ]\n'
|
| 167 |
"}\n"
|
| 168 |
"Rules:\n"
|
| 169 |
+
"1. Date: YYYY-MM-DD\n"
|
| 170 |
+
"2. Use null for missing values\n"
|
| 171 |
+
"3. Currency values must include a symbol or code\n"
|
| 172 |
+
"4. No extra keys or explanatory text\n"
|
| 173 |
+
"5. Output must start with '{' and end with '}'\n\n"
|
| 174 |
"Invoice Text:\n"
|
| 175 |
+ text
|
| 176 |
)
|
|
|
|
| 189 |
if not data:
|
| 190 |
return None
|
| 191 |
|
| 192 |
+
# normalize fields
|
| 193 |
if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
|
| 194 |
hdr = data.setdefault("invoice_header", {})
|
| 195 |
for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
|
|
|
|
| 212 |
tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
|
| 213 |
|
| 214 |
with tab1:
|
| 215 |
+
st.title("PDF to Bullet-Point Summarizer")
|
| 216 |
pdf = st.file_uploader("Upload PDF", type="pdf")
|
| 217 |
pct = st.slider("Summarization (%)", 1, 100, 20)
|
| 218 |
if st.button("Summarize") and pdf:
|
|
|
|
| 220 |
keys = extract_key_phrases(txt)
|
| 221 |
scores = score_sentences(txt, keys)
|
| 222 |
n = max(1, len(scores) * pct // 100)
|
| 223 |
+
summary = summarize_text(scores, num_points=n)
|
| 224 |
st.subheader("Summary")
|
| 225 |
+
st.markdown(summary)
|
| 226 |
|
| 227 |
with tab2:
|
| 228 |
st.title("Invoice Extractor")
|
|
|
|
| 232 |
txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
|
| 233 |
info = extract_invoice_info(mdl, txt)
|
| 234 |
if info:
|
| 235 |
+
st.success("Extraction Complete")
|
| 236 |
if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
|
| 237 |
h = info["invoice_header"]
|
| 238 |
c1, c2, c3 = st.columns(3)
|