Seth0330 committed on
Commit
835898b
·
verified ·
1 Parent(s): 7e5da41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -21
app.py CHANGED
@@ -10,11 +10,11 @@ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
10
 
11
  # Configure Streamlit
12
  st.set_page_config(
13
- page_title="PDF Tools - Summarizer & Invoice Extractor",
14
  layout="wide",
15
  )
16
 
17
- # Model Configuration for Invoice Extractor
18
  MODELS = {
19
  "DeepSeek v3": {
20
  "api_url": "https://api.deepseek.com/v1/chat/completions",
@@ -93,9 +93,9 @@ def clean_json_response(text):
93
  if not text:
94
  return None
95
  original = text
96
- # remove any ``` fences
97
  text = re.sub(r'```(?:json)?', '', text).strip()
98
- # find outer braces
99
  start = text.find('{')
100
  end = text.rfind('}') + 1
101
  if start < 0 or end < 1:
@@ -111,7 +111,6 @@ def clean_json_response(text):
111
  return None
112
 
113
  def get_extraction_prompt(model_choice, text):
114
- # NOTE: every prompt below includes the word "json" in lowercase
115
  if model_choice == "DeepSeek v3":
116
  return (
117
  "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
@@ -121,12 +120,13 @@ def get_extraction_prompt(model_choice, text):
121
  ' "po_number": "string or null",\n'
122
  ' "invoice_value": "string with currency symbol",\n'
123
  ' "line_items": [\n'
124
- " {...}\n"
 
125
  " ]\n"
126
  "}\n"
127
  "Rules:\n"
128
  "1. Use null for missing fields\n"
129
- "2. Do not include any additional text\n\n"
130
  "Invoice Text:\n"
131
  + text
132
  )
@@ -134,23 +134,43 @@ def get_extraction_prompt(model_choice, text):
134
  elif model_choice == "DeepSeek R1":
135
  return (
136
  "Please extract invoice info from the text below and return only raw json:\n"
137
- "{...}\n"
 
 
 
138
  "Invoice Text:\n"
139
  + text
140
  )
141
 
142
- else: # Llama / Mistral
143
  return (
144
- "Extract complete invoice information and return a valid json object with these fields:\n"
 
145
  "{\n"
146
- ' "invoice_header": {...},\n'
147
- ' "line_items": [...]\n'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  "}\n"
149
  "Rules:\n"
150
- "1. Return ONLY json\n"
151
- "2. Date format YYYY-MM-DD\n"
152
- "3. Currency values with symbol\n"
153
- "4. Do not include any explanations\n\n"
 
154
  "Invoice Text:\n"
155
  + text
156
  )
@@ -169,7 +189,7 @@ def extract_invoice_info(model_choice, text):
169
  if not data:
170
  return None
171
 
172
- # normalize
173
  if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
174
  hdr = data.setdefault("invoice_header", {})
175
  for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
@@ -192,7 +212,7 @@ def extract_invoice_info(model_choice, text):
192
  tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
193
 
194
  with tab1:
195
- st.title("PDF to Bullet Point Summarizer")
196
  pdf = st.file_uploader("Upload PDF", type="pdf")
197
  pct = st.slider("Summarization (%)", 1, 100, 20)
198
  if st.button("Summarize") and pdf:
@@ -200,9 +220,9 @@ with tab1:
200
  keys = extract_key_phrases(txt)
201
  scores = score_sentences(txt, keys)
202
  n = max(1, len(scores) * pct // 100)
203
- bullet = summarize_text(scores, num_points=n)
204
  st.subheader("Summary")
205
- st.markdown(bullet)
206
 
207
  with tab2:
208
  st.title("Invoice Extractor")
@@ -212,7 +232,7 @@ with tab2:
212
  txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
213
  info = extract_invoice_info(mdl, txt)
214
  if info:
215
- st.success("Done")
216
  if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
217
  h = info["invoice_header"]
218
  c1, c2, c3 = st.columns(3)
 
10
 
11
  # Configure Streamlit
12
  st.set_page_config(
13
+ page_title="PDF Tools Summarizer & Invoice Extractor",
14
  layout="wide",
15
  )
16
 
17
+ # Model configurations
18
  MODELS = {
19
  "DeepSeek v3": {
20
  "api_url": "https://api.deepseek.com/v1/chat/completions",
 
93
  if not text:
94
  return None
95
  original = text
96
+ # strip any ``` fences
97
  text = re.sub(r'```(?:json)?', '', text).strip()
98
+ # locate outermost JSON braces
99
  start = text.find('{')
100
  end = text.rfind('}') + 1
101
  if start < 0 or end < 1:
 
111
  return None
112
 
113
  def get_extraction_prompt(model_choice, text):
 
114
  if model_choice == "DeepSeek v3":
115
  return (
116
  "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
 
120
  ' "po_number": "string or null",\n'
121
  ' "invoice_value": "string with currency symbol",\n'
122
  ' "line_items": [\n'
123
+ " { \"description\": \"string\", \"quantity\": \"number or string\", "
124
+ "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }\n"
125
  " ]\n"
126
  "}\n"
127
  "Rules:\n"
128
  "1. Use null for missing fields\n"
129
+ "2. Do not include any extra text\n\n"
130
  "Invoice Text:\n"
131
  + text
132
  )
 
134
  elif model_choice == "DeepSeek R1":
135
  return (
136
  "Please extract invoice info from the text below and return only raw json:\n"
137
+ "{ \"invoice_number\": \"string or null\", \"invoice_date\": \"YYYY-MM-DD or null\", "
138
+ "\"po_number\": \"string or null\", \"invoice_value\": \"string with currency or null\", "
139
+ "\"line_items\": [{ \"description\": \"string\", \"quantity\": \"number or string\", "
140
+ "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }] }\n"
141
  "Invoice Text:\n"
142
  + text
143
  )
144
 
145
+ else: # Llama & Mistral
146
  return (
147
+ "You are given the text of an invoice. Extract the invoice information and return ONLY a valid json object "
148
+ "formatted exactly as below (nothing else):\n"
149
  "{\n"
150
+ ' "invoice_header": {\n'
151
+ ' "invoice_number": "string",\n'
152
+ ' "invoice_date": "YYYY-MM-DD",\n'
153
+ ' "po_number": "string or null",\n'
154
+ ' "invoice_value": "string with currency symbol",\n'
155
+ ' "supplier_name": "string or null",\n'
156
+ ' "customer_name": "string or null"\n'
157
+ ' },\n'
158
+ ' "line_items": [\n'
159
+ ' {\n'
160
+ ' "item_number": "string or null",\n'
161
+ ' "description": "string",\n'
162
+ ' "quantity": number,\n'
163
+ ' "unit_price": "string with currency symbol",\n'
164
+ ' "total_price": "string with currency symbol"\n'
165
+ ' }\n'
166
+ ' ]\n'
167
  "}\n"
168
  "Rules:\n"
169
+ "1. Date: YYYY-MM-DD\n"
170
+ "2. Use null for missing values\n"
171
+ "3. Currency values must include a symbol or code\n"
172
+ "4. No extra keys or explanatory text\n"
173
+ "5. Output must start with '{' and end with '}'\n\n"
174
  "Invoice Text:\n"
175
  + text
176
  )
 
189
  if not data:
190
  return None
191
 
192
+ # normalize fields
193
  if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
194
  hdr = data.setdefault("invoice_header", {})
195
  for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
 
212
  tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
213
 
214
  with tab1:
215
+ st.title("PDF to Bullet-Point Summarizer")
216
  pdf = st.file_uploader("Upload PDF", type="pdf")
217
  pct = st.slider("Summarization (%)", 1, 100, 20)
218
  if st.button("Summarize") and pdf:
 
220
  keys = extract_key_phrases(txt)
221
  scores = score_sentences(txt, keys)
222
  n = max(1, len(scores) * pct // 100)
223
+ summary = summarize_text(scores, num_points=n)
224
  st.subheader("Summary")
225
+ st.markdown(summary)
226
 
227
  with tab2:
228
  st.title("Invoice Extractor")
 
232
  txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
233
  info = extract_invoice_info(mdl, txt)
234
  if info:
235
+ st.success("Extraction Complete")
236
  if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
237
  h = info["invoice_header"]
238
  c1, c2, c3 = st.columns(3)