Seth0330 committed on
Commit
e7adc3a
·
verified ·
1 Parent(s): 835898b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -145
app.py CHANGED
@@ -4,34 +4,28 @@ import requests
4
  import json
5
  import re
6
  import os
7
- from datetime import datetime
8
 
9
  from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
10
 
11
- # Configure Streamlit
12
- st.set_page_config(
13
- page_title="PDF Tools – Summarizer & Invoice Extractor",
14
- layout="wide",
15
- )
16
 
17
- # Model configurations
18
  MODELS = {
19
  "DeepSeek v3": {
20
  "api_url": "https://api.deepseek.com/v1/chat/completions",
21
- "model_name": "deepseek-chat",
22
- "api_key_env": "DEEPSEEK_API_KEY",
23
  "response_format": {"type": "json_object"},
24
  },
25
  "DeepSeek R1": {
26
  "api_url": "https://api.deepseek.com/v1/chat/completions",
27
- "model_name": "deepseek-reasoner",
28
- "api_key_env": "DEEPSEEK_API_KEY",
29
  "response_format": None,
30
  },
31
  "Llama 4 Mavericks": {
32
  "api_url": "https://openrouter.ai/api/v1/chat/completions",
33
- "model_name": "meta-llama/llama-4-maverick:free",
34
- "api_key_env": "OPENROUTER_API_KEY",
35
  "response_format": {"type": "json_object"},
36
  "extra_headers": {
37
  "HTTP-Referer": "https://huggingface.co",
@@ -40,8 +34,8 @@ MODELS = {
40
  },
41
  "Mistral Small": {
42
  "api_url": "https://openrouter.ai/api/v1/chat/completions",
43
- "model_name": "mistralai/mistral-small-3.1-24b-instruct:free",
44
- "api_key_env": "OPENROUTER_API_KEY",
45
  "response_format": {"type": "json_object"},
46
  "extra_headers": {
47
  "HTTP-Referer": "https://huggingface.co",
@@ -51,9 +45,9 @@ MODELS = {
51
  }
52
 
53
  def get_api_key(model_choice):
54
- key = os.environ.get(MODELS[model_choice]["api_key_env"])
55
  if not key:
56
- st.error(f"❌ {MODELS[model_choice]['api_key_env']} not set")
57
  st.stop()
58
  return key
59
 
@@ -65,44 +59,44 @@ def query_llm(model_choice, prompt):
65
  }
66
  if cfg.get("extra_headers"):
67
  headers.update(cfg["extra_headers"])
68
-
69
  payload = {
70
- "model": cfg["model_name"],
71
  "messages": [{"role": "user", "content": prompt}],
72
  "temperature": 0.1,
73
  "max_tokens": 2000,
74
  }
75
  if cfg.get("response_format"):
76
  payload["response_format"] = cfg["response_format"]
77
-
78
  try:
79
  with st.spinner(f"🔍 Querying {model_choice}..."):
80
- resp = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
81
- if resp.status_code != 200:
82
- st.error(f"🚨 API Error {resp.status_code}: {resp.text}")
83
  return None
84
- content = resp.json()["choices"][0]["message"]["content"]
85
- st.session_state.last_api_response = content
86
- st.session_state.last_api_raw = resp.text
87
  return content
88
  except Exception as e:
89
- st.error(f"Connection failed: {e}")
90
  return None
91
 
92
  def clean_json_response(text):
93
  if not text:
94
  return None
95
- original = text
96
- # strip any ``` fences
97
  text = re.sub(r'```(?:json)?', '', text).strip()
98
- # locate outermost JSON braces
99
  start = text.find('{')
100
  end = text.rfind('}') + 1
101
  if start < 0 or end < 1:
102
- st.error("Couldn't locate JSON in response.")
103
- st.code(original)
104
  return None
105
  fragment = text[start:end]
 
 
106
  try:
107
  return json.loads(fragment)
108
  except json.JSONDecodeError as e:
@@ -110,152 +104,101 @@ def clean_json_response(text):
110
  st.code(fragment)
111
  return None
112
 
113
- def get_extraction_prompt(model_choice, text):
114
- if model_choice == "DeepSeek v3":
115
- return (
116
- "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
117
- "{\n"
118
- ' "invoice_number": "string",\n'
119
- ' "invoice_date": "YYYY-MM-DD",\n'
120
- ' "po_number": "string or null",\n'
121
- ' "invoice_value": "string with currency symbol",\n'
122
- ' "line_items": [\n'
123
- " { \"description\": \"string\", \"quantity\": \"number or string\", "
124
- "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }\n"
125
- " ]\n"
126
- "}\n"
127
- "Rules:\n"
128
- "1. Use null for missing fields\n"
129
- "2. Do not include any extra text\n\n"
130
- "Invoice Text:\n"
131
- + text
132
- )
133
 
134
- elif model_choice == "DeepSeek R1":
 
 
135
  return (
136
- "Please extract invoice info from the text below and return only raw json:\n"
137
- "{ \"invoice_number\": \"string or null\", \"invoice_date\": \"YYYY-MM-DD or null\", "
138
- "\"po_number\": \"string or null\", \"invoice_value\": \"string with currency or null\", "
139
- "\"line_items\": [{ \"description\": \"string\", \"quantity\": \"number or string\", "
140
- "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }] }\n"
141
- "Invoice Text:\n"
142
- + text
143
  )
144
-
145
- else: # Llama & Mistral
146
  return (
147
- "You are given the text of an invoice. Extract the invoice information and return ONLY a valid json object "
148
- "formatted exactly as below (nothing else):\n"
149
- "{\n"
150
- ' "invoice_header": {\n'
151
- ' "invoice_number": "string",\n'
152
- ' "invoice_date": "YYYY-MM-DD",\n'
153
- ' "po_number": "string or null",\n'
154
- ' "invoice_value": "string with currency symbol",\n'
155
- ' "supplier_name": "string or null",\n'
156
- ' "customer_name": "string or null"\n'
157
- ' },\n'
158
- ' "line_items": [\n'
159
- ' {\n'
160
- ' "item_number": "string or null",\n'
161
- ' "description": "string",\n'
162
- ' "quantity": number,\n'
163
- ' "unit_price": "string with currency symbol",\n'
164
- ' "total_price": "string with currency symbol"\n'
165
- ' }\n'
166
- ' ]\n'
167
- "}\n"
168
- "Rules:\n"
169
- "1. Date: YYYY-MM-DD\n"
170
- "2. Use null for missing values\n"
171
- "3. Currency values must include a symbol or code\n"
172
- "4. No extra keys or explanatory text\n"
173
- "5. Output must start with '{' and end with '}'\n\n"
174
- "Invoice Text:\n"
175
- + text
176
  )
177
 
178
  def extract_invoice_info(model_choice, text):
179
  prompt = get_extraction_prompt(model_choice, text)
180
  raw = query_llm(model_choice, prompt)
181
- if raw is None:
182
  return None
183
- if not raw.strip():
184
- st.error("Empty response from API.")
185
- st.code(st.session_state.last_api_raw)
186
- return None
187
-
188
  data = clean_json_response(raw)
189
  if not data:
190
  return None
191
 
192
- # normalize fields
193
- if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
194
- hdr = data.setdefault("invoice_header", {})
195
- for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
196
- hdr.setdefault(k, None)
197
- items = data.setdefault("line_items", [])
 
 
198
  for itm in items:
199
- for k in ["item_number", "description", "quantity", "unit_price", "total_price"]:
200
- itm.setdefault(k, None)
201
  else:
202
- for k in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
203
- data.setdefault(k, None)
204
- items = data.setdefault("line_items", [])
205
  for itm in items:
206
- for k in ["description", "quantity", "unit_price", "total_price"]:
207
- itm.setdefault(k, None)
208
 
209
  return data
210
 
211
- # ---- UI ----
212
- tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
213
 
214
  with tab1:
215
- st.title("PDF to Bullet-Point Summarizer")
216
- pdf = st.file_uploader("Upload PDF", type="pdf")
217
- pct = st.slider("Summarization (%)", 1, 100, 20)
218
  if st.button("Summarize") and pdf:
219
  txt = read_pdf(io.BytesIO(pdf.getvalue()))
220
  keys = extract_key_phrases(txt)
221
- scores = score_sentences(txt, keys)
222
- n = max(1, len(scores) * pct // 100)
223
- summary = summarize_text(scores, num_points=n)
224
- st.subheader("Summary")
225
- st.markdown(summary)
226
 
227
  with tab2:
228
  st.title("Invoice Extractor")
229
- mdl = st.selectbox("Model", list(MODELS.keys()))
230
- inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
231
  if st.button("Extract") and inv_pdf:
232
  txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
233
- info = extract_invoice_info(mdl, txt)
234
  if info:
235
- st.success("Extraction Complete")
236
- if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
237
- h = info["invoice_header"]
238
- c1, c2, c3 = st.columns(3)
239
- c1.metric("Invoice #", h["invoice_number"])
240
- c1.metric("Supplier", h["supplier_name"])
241
- c2.metric("Date", h["invoice_date"])
242
- c2.metric("Customer", h["customer_name"])
243
- c3.metric("PO #", h["po_number"])
244
- c3.metric("Total", h["invoice_value"])
245
- st.subheader("Line Items")
246
  st.table(info["line_items"])
247
  else:
248
- c1, c2 = st.columns(2)
249
- c1.metric("Invoice #", info["invoice_number"])
250
- c1.metric("PO #", info["po_number"])
251
- c2.metric("Date", info["invoice_date"])
252
- c2.metric("Value", info["invoice_value"])
253
- st.subheader("Line Items")
254
  st.table(info["line_items"])
255
 
256
- if "last_api_response" in st.session_state:
257
  with st.expander("Debug"):
258
- st.write("Raw assistant content:")
259
- st.code(st.session_state.last_api_response)
260
- st.write("Full HTTP response:")
261
- st.code(st.session_state.last_api_raw)
 
4
  import json
5
  import re
6
  import os
 
7
 
8
  from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
9
 
10
+ st.set_page_config(page_title="PDF Tools", layout="wide")
 
 
 
 
11
 
 
12
  MODELS = {
13
  "DeepSeek v3": {
14
  "api_url": "https://api.deepseek.com/v1/chat/completions",
15
+ "model": "deepseek-chat",
16
+ "key_env": "DEEPSEEK_API_KEY",
17
  "response_format": {"type": "json_object"},
18
  },
19
  "DeepSeek R1": {
20
  "api_url": "https://api.deepseek.com/v1/chat/completions",
21
+ "model": "deepseek-reasoner",
22
+ "key_env": "DEEPSEEK_API_KEY",
23
  "response_format": None,
24
  },
25
  "Llama 4 Mavericks": {
26
  "api_url": "https://openrouter.ai/api/v1/chat/completions",
27
+ "model": "meta-llama/llama-4-maverick:free",
28
+ "key_env": "OPENROUTER_API_KEY",
29
  "response_format": {"type": "json_object"},
30
  "extra_headers": {
31
  "HTTP-Referer": "https://huggingface.co",
 
34
  },
35
  "Mistral Small": {
36
  "api_url": "https://openrouter.ai/api/v1/chat/completions",
37
+ "model": "mistralai/mistral-small-3.1-24b-instruct:free",
38
+ "key_env": "OPENROUTER_API_KEY",
39
  "response_format": {"type": "json_object"},
40
  "extra_headers": {
41
  "HTTP-Referer": "https://huggingface.co",
 
45
  }
46
 
def get_api_key(model_choice):
    """Return the API key configured for *model_choice*.

    The name of the environment variable is taken from the model's
    ``key_env`` entry in ``MODELS``. When that variable is unset or empty,
    an error naming it is shown with ``st.error`` and ``st.stop()`` is
    called.
    """
    env_name = MODELS[model_choice]["key_env"]
    api_key = os.getenv(env_name)
    if api_key:
        return api_key
    st.error(f"❌ {env_name} not set")
    st.stop()
    return api_key
53
 
 
59
  }
60
  if cfg.get("extra_headers"):
61
  headers.update(cfg["extra_headers"])
 
62
  payload = {
63
+ "model": cfg["model"],
64
  "messages": [{"role": "user", "content": prompt}],
65
  "temperature": 0.1,
66
  "max_tokens": 2000,
67
  }
68
  if cfg.get("response_format"):
69
  payload["response_format"] = cfg["response_format"]
 
70
  try:
71
  with st.spinner(f"🔍 Querying {model_choice}..."):
72
+ r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
73
+ if r.status_code != 200:
74
+ st.error(f"🚨 API Error {r.status_code}: {r.text}")
75
  return None
76
+ content = r.json()["choices"][0]["message"]["content"]
77
+ st.session_state.last_api = content
78
+ st.session_state.last_raw = r.text
79
  return content
80
  except Exception as e:
81
+ st.error(f"Connection error: {e}")
82
  return None
83
 
84
  def clean_json_response(text):
85
  if not text:
86
  return None
87
+ orig = text
88
+ # strip fences
89
  text = re.sub(r'```(?:json)?', '', text).strip()
90
+ # grab braces
91
  start = text.find('{')
92
  end = text.rfind('}') + 1
93
  if start < 0 or end < 1:
94
+ st.error("Couldn't locate JSON")
95
+ st.code(orig)
96
  return None
97
  fragment = text[start:end]
98
+ # remove stray trailing commas before } or ]
99
+ fragment = re.sub(r',\s*([}\]])', r'\1', fragment)
100
  try:
101
  return json.loads(fragment)
102
  except json.JSONDecodeError as e:
 
104
  st.code(fragment)
105
  return None
106
 
def fallback_supplier(text):
    """Guess a supplier name as the first non-blank line of *text*.

    This is a heuristic used when the model returns no supplier name.

    Args:
        text: Text to scan. ``None`` and the empty string are accepted.

    Returns:
        The first line that is non-empty after stripping surrounding
        whitespace (returned stripped), or ``None`` when there is no such
        line.
    """
    if not text:
        # Covers both None and "" so callers need not pre-check the text.
        return None
    stripped_lines = (line.strip() for line in text.splitlines())
    # Stop at the first hit instead of materialising every line.
    return next((line for line in stripped_lines if line), None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
def get_extraction_prompt(model_choice, txt):
    """Build the invoice-extraction prompt for *model_choice*.

    Model names starting with ``"DeepSeek"`` get a flat schema (header
    fields at the top level); every other model gets a schema with a nested
    ``invoice_header`` object. Both prompts ask for compact JSON and end
    with the invoice text *txt*.
    """
    if model_choice.startswith("DeepSeek"):
        instruction = (
            "Extract full invoice info below and RETURN ONLY a valid json "
            "object (compact, single line) with these fields:"
        )
        schema = (
            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD","po_number":"string|null",'
            '"invoice_value":"string with currency","line_items":[{"description":"string","quantity":"number",'
            '"unit_price":"string with currency","total_price":"string with currency"}]}'
        )
        rules = "Use null for missing fields. NO extra text."
    else:
        instruction = (
            "You are given invoice text. Extract data and RETURN ONLY a "
            "compact json object (one line) exactly like this:"
        )
        schema = (
            '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
            '"po_number":"string|null","invoice_value":"string with currency",'
            '"supplier_name":"string|null","customer_name":"string|null"},'
            '"line_items":[{"item_number":"string|null","description":"string","quantity":number,'
            '"unit_price":"string with currency","total_price":"string with currency"}]}'
        )
        rules = "Use null for missing. NO explanations or extra keys."

    # Instruction, schema and rules are each one line; the invoice text
    # follows after a blank line.
    return f"{instruction}\n{schema}\n{rules}\n\nInvoice Text:\n{txt}"
134
 
def extract_invoice_info(model_choice, text):
    """Extract structured invoice data from *text* using *model_choice*.

    Builds the model-specific prompt, queries the model, parses the JSON in
    its reply, then normalizes the result so every expected key is present
    (missing ones are set to ``None``).

    For "Llama 4 Mavericks" and "Mistral Small" the header fields live under
    ``data["invoice_header"]`` and a missing supplier name is filled in with
    ``fallback_supplier(text)``. For the other models the header fields sit
    at the top level of the result.

    Args:
        model_choice: A key of ``MODELS``.
        text: The invoice text to send to the model.

    Returns:
        The normalized dict, or ``None`` when the query returned nothing or
        no JSON could be parsed from the reply.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None

    data = clean_json_response(raw)
    if not data:
        return None

    if model_choice in ("Llama 4 Mavericks", "Mistral Small"):
        hdr = data.get("invoice_header")
        if not isinstance(hdr, dict):
            # setdefault() would keep an explicit null (or any non-object)
            # returned by the model, and the key fill below would then fail.
            hdr = data["invoice_header"] = {}
        for k in ("invoice_number", "invoice_date", "po_number",
                  "invoice_value", "supplier_name", "customer_name"):
            hdr.setdefault(k, None)
        if not hdr.get("supplier_name"):
            hdr["supplier_name"] = fallback_supplier(text)
        item_keys = ("item_number", "description", "quantity",
                     "unit_price", "total_price")
    else:
        for k in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
            data.setdefault(k, None)
        item_keys = ("description", "quantity", "unit_price", "total_price")

    items = data.get("line_items")
    if not isinstance(items, list):
        # The prompts tell the model to use null for missing values, so
        # "line_items": null is a reply to expect; treat it as no items.
        items = []
    # Keep only object entries: anything else cannot carry the item keys.
    items = [itm for itm in items if isinstance(itm, dict)]
    data["line_items"] = items
    for itm in items:
        for k in item_keys:
            itm.setdefault(k, None)

    return data
164
 
165
# Streamlit page body: two tabs, a PDF summarizer and an invoice extractor.
# NOTE(review): this diff rendering dropped all indentation; nesting below is
# implied by syntax only — confirm against the real app.py.
+ # UI
166
+ tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
167
 
168
  with tab1:
169
+ st.title("PDF Bullet-Point Summarizer")
170
+ pdf = st.file_uploader("Upload PDF",type="pdf")
171
+ pct = st.slider("Summarization %",1,100,20)
172
# Runs only when the button is clicked and a PDF has been uploaded.
  if st.button("Summarize") and pdf:
173
  txt = read_pdf(io.BytesIO(pdf.getvalue()))
174
  keys = extract_key_phrases(txt)
175
+ scores = score_sentences(txt,keys)
176
# num_points is pct percent of len(scores) (floor division), never below 1.
+ n = max(1, len(scores)*pct//100)
177
+ st.markdown(summarize_text(scores,num_points=n))
 
 
178
 
179
  with tab2:
180
  st.title("Invoice Extractor")
181
+ mdl = st.selectbox("Model",list(MODELS.keys()))
182
+ inv_pdf = st.file_uploader("Invoice PDF",type="pdf")
183
# Runs only when the button is clicked and an invoice PDF has been uploaded.
  if st.button("Extract") and inv_pdf:
184
  txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
185
+ info = extract_invoice_info(mdl,txt)
186
  if info:
187
+ st.success("Done")
188
# Llama/Mistral results keep header fields under info["invoice_header"]:
# six metrics across three columns, then the line-item table.
+ if mdl in ("Llama 4 Mavericks","Mistral Small"):
189
+ h=info["invoice_header"]
190
+ c1,c2,c3=st.columns(3)
191
+ c1.metric("Invoice #",h["invoice_number"]);c1.metric("Supplier",h["supplier_name"])
192
+ c2.metric("Date",h["invoice_date"]);c2.metric("Customer",h["customer_name"])
193
+ c3.metric("PO #",h["po_number"]);c3.metric("Total",h["invoice_value"])
 
 
 
 
194
  st.table(info["line_items"])
195
# Other models: the four header fields are read from the top level of info.
  else:
196
+ c1,c2=st.columns(2)
197
+ c1.metric("Invoice #",info["invoice_number"]);c1.metric("PO #",info["po_number"])
198
+ c2.metric("Date",info["invoice_date"]);c2.metric("Value",info["invoice_value"])
 
 
 
199
  st.table(info["line_items"])
200
 
201
# Debug expander: shows the last_api / last_raw values kept in
# st.session_state, displayed only once "last_api" is present.
# NOTE(review): whether this sits inside `with tab2:` or at module level
# cannot be determined from this rendering — confirm.
+ if "last_api" in st.session_state:
202
  with st.expander("Debug"):
203
+ st.code(st.session_state.last_api)
204
+ st.code(st.session_state.last_raw)