Seth0330 committed on
Commit
0592d14
·
verified ·
1 Parent(s): 796b6f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -80
app.py CHANGED
@@ -1,98 +1,254 @@
1
  import streamlit as st
 
2
  import requests
3
- import time
 
4
  import os
 
 
 
 
 
5
 
6
# CONFIG
# The key is read from the environment only. A real key must never be used as
# the getenv() fallback: it would be published with the source. The empty-string
# default keeps the header value a valid str, so a missing key surfaces as an
# HTTP error reported by the request helpers instead of a crash.
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY", "")
BASE_URL = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def upload_pdf_to_unstract(pdf_file):
    """Upload a PDF to the ``/whisper`` endpoint and return its tracking hash.

    Args:
        pdf_file: Binary file-like object supporting ``seek`` and ``read``
            (for example a Streamlit upload). Its ``name`` attribute is used
            as the upload file name when it ends in ``.pdf``.

    Returns:
        The ``whisper_hash`` string from the response, or ``None`` when the
        request cannot be sent, is rejected, or the response carries no hash.
        Every failure is reported to the user through ``st.error``.
    """
    url = f"{BASE_URL}/whisper"
    headers = {
        "unstract-key": UNSTRACT_API_KEY,
    }
    # Always reset file pointer
    pdf_file.seek(0)
    file_bytes = pdf_file.read()

    # Force correct .pdf extension and type
    file_name = getattr(pdf_file, "name", None)
    if not file_name or not file_name.lower().endswith(".pdf"):
        file_name = "invoice.pdf"

    files = {
        "file": (file_name, file_bytes, "application/pdf"),
    }

    with st.spinner("Uploading and starting OCR..."):
        try:
            # Without a timeout a stalled connection would hang the app forever.
            resp = requests.post(url, headers=headers, files=files, timeout=60)
        except requests.RequestException as exc:
            st.error(f"Upload failed: {exc}")
            return None

    if resp.status_code not in (200, 202):
        st.error(f"Upload failed: {resp.status_code}: {resp.text}")
        return None

    try:
        data = resp.json()
    except ValueError:
        # A success status with a non-JSON body would otherwise raise here.
        st.error(f"Upload failed: response was not JSON: {resp.text}")
        return None

    whisper_hash = data.get("whisper_hash") if isinstance(data, dict) else None
    if not whisper_hash:
        st.error(f"No whisper_hash in response: {data}")
        return None
    return whisper_hash
41
-
42
def poll_until_processed(whisper_hash, poll_interval=3, max_attempts=30):
    """Poll ``/whisper-status`` until the job is processed, fails, or times out.

    Args:
        whisper_hash: Tracking hash returned by the upload step.
        poll_interval: Seconds to wait between two status checks.
        max_attempts: Maximum number of status checks before giving up.

    Returns:
        ``True`` once the reported status is ``"processed"``; ``False`` on a
        transport error, a non-200 answer, a ``"failed"``/``"error"`` status,
        or when ``max_attempts`` checks pass without completion. Failures are
        reported through ``st.error``.
    """
    status_url = f"{BASE_URL}/whisper-status"
    headers = {
        "unstract-key": UNSTRACT_API_KEY,
    }
    # Let requests build the query string so the hash is URL-encoded.
    params = {"whisper_hash": whisper_hash}

    with st.spinner("Processing PDF (OCR in progress)..."):
        for attempt in range(max_attempts):
            try:
                resp = requests.get(
                    status_url, headers=headers, params=params, timeout=30
                )
            except requests.RequestException as exc:
                st.error(f"Status check failed: {exc}")
                return False
            if resp.status_code != 200:
                st.error(f"Status check failed: {resp.status_code}: {resp.text}")
                return False
            try:
                status = resp.json().get("status")
            except (ValueError, AttributeError):
                # Non-JSON body, or JSON that is not an object.
                st.error(f"Status check failed: unexpected response: {resp.text}")
                return False
            if status == "processed":
                return True
            if status in ("failed", "error"):
                st.error(f"Processing failed: {resp.text}")
                return False
            # No point sleeping after the last check; we are about to give up.
            if attempt < max_attempts - 1:
                time.sleep(poll_interval)

    st.error("Timed out waiting for OCR to complete.")
    return False
62
-
63
def retrieve_text(whisper_hash):
    """Fetch the extracted text for a processed job from ``/whisper-retrieve``.

    Args:
        whisper_hash: Tracking hash returned by the upload step.

    Returns:
        The ``result_text`` field of the JSON response (``""`` when the field
        is absent), or ``None`` when the request fails, returns a non-200
        status, or the body is not a JSON object. Failures are reported
        through ``st.error``.
    """
    retrieve_url = f"{BASE_URL}/whisper-retrieve"
    headers = {
        "unstract-key": UNSTRACT_API_KEY,
    }
    params = {"whisper_hash": whisper_hash, "text_only": "true"}

    with st.spinner("Retrieving extracted text..."):
        try:
            resp = requests.get(
                retrieve_url, headers=headers, params=params, timeout=60
            )
        except requests.RequestException as exc:
            st.error(f"Retrieve failed: {exc}")
            return None

    if resp.status_code != 200:
        st.error(f"Retrieve failed: {resp.status_code}: {resp.text}")
        return None

    try:
        data = resp.json()
    except ValueError:
        st.error(f"Retrieve failed: response was not JSON: {resp.text}")
        return None
    if not isinstance(data, dict):
        st.error(f"Retrieve failed: unexpected response: {resp.text}")
        return None
    return data.get("result_text", "")
76
 
77
# Page flow: upload a PDF, send it through OCR, then show the extracted text.
st.title("Unstract OCR: PDF Invoice Text Extraction")

uploaded_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")

if st.button("Extract Text from PDF"):
    if not uploaded_pdf:
        # Previously a click without a file did nothing at all, with no hint why.
        st.warning("Please upload a PDF before extracting.")
    else:
        whisper_hash = upload_pdf_to_unstract(uploaded_pdf)
        if not whisper_hash:
            # The upload helper has already shown the error.
            st.stop()
        st.success(f"File accepted. Tracking hash: {whisper_hash}")

        if poll_until_processed(whisper_hash):
            text = retrieve_text(whisper_hash)
            if text:
                st.success("Text extraction complete!")
                st.subheader("Extracted Text:")
                st.text_area("Extracted Text", text, height=400)
            else:
                st.error("Extraction failed at retrieve step.")
        else:
            st.error("OCR did not complete successfully.")

st.caption("Powered by Unstract LLMWhisperer OCR API.")
 
 
 
 
1
  import streamlit as st
2
+ import io
3
  import requests
4
+ import json
5
+ import re
6
  import os
7
+ import time
8
+
9
+ from main import extract_key_phrases, score_sentences, summarize_text # read_pdf removed
10
+
11
# Page-wide Streamlit settings: browser tab title and wide layout.
st.set_page_config(layout="wide", page_title="PDF Tools")
12
 
13
# -------- LLM Model Setup --------
# One entry per selectable model, keyed by the label shown in the UI.
#   api_url          chat-completions endpoint the request is POSTed to
#   model            model identifier sent in the request payload
#   key_env          name of the environment variable holding the API key
#   response_format  optional "response_format" payload value (None = omit)
#   extra_headers    optional additional HTTP headers for the provider
MODELS = {
    "DeepSeek v3": {
        "api_url": "https://api.deepseek.com/v1/chat/completions",
        "model": "deepseek-chat",
        "key_env": "DEEPSEEK_API_KEY",
        "response_format": {"type": "json_object"},
        "extra_headers": {},
    },
    "DeepSeek R1": {
        "api_url": "https://api.deepseek.com/v1/chat/completions",
        "model": "deepseek-reasoner",
        "key_env": "DEEPSEEK_API_KEY",
        "response_format": None,
        "extra_headers": {},
    },
    "OpenAI GPT-4.1": {
        "api_url": "https://api.openai.com/v1/chat/completions",
        # Was "gpt-4-1106-preview" (the GPT-4 Turbo preview), which does not
        # match the "GPT-4.1" label shown to the user.
        # NOTE(review): confirm "gpt-4.1" is enabled for the deployed API key.
        "model": "gpt-4.1",
        "key_env": "OPENAI_API_KEY",
        "response_format": None,
        "extra_headers": {},
    },
    "Mistral Small": {
        "api_url": "https://openrouter.ai/api/v1/chat/completions",
        "model": "mistralai/mistral-small-3.1-24b-instruct:free",
        "key_env": "OPENROUTER_API_KEY",
        "response_format": {"type": "json_object"},
        "extra_headers": {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    },
}
45
 
46
def get_api_key(model_choice):
    """Return the API key configured for ``model_choice``.

    The key is read from the environment variable named by the model's
    ``key_env`` entry in ``MODELS``. Surrounding whitespace is stripped so a
    key pasted with a trailing newline does not produce an invalid
    ``Authorization`` header.

    Args:
        model_choice: A key of ``MODELS``.

    Returns:
        The non-empty API key string. When the variable is unset or blank,
        an error is shown and ``st.stop()`` halts the current script run.
    """
    env_name = MODELS[model_choice]["key_env"]
    key = (os.getenv(env_name) or "").strip()
    if not key:
        st.error(f"❌ {env_name} not set")
        st.stop()
    return key
52
+
53
def query_llm(model_choice, prompt):
    """Send ``prompt`` as a single user message to the selected model.

    Args:
        model_choice: A key of ``MODELS`` selecting endpoint, model and key.
        prompt: Text sent as the content of the one user message.

    Returns:
        The ``choices[0].message.content`` value of the response, or ``None``
        when the request cannot be sent, the API answers with a non-200
        status, or the body does not have the expected shape. Failures are
        reported through ``st.error``. On success the content and the raw
        response text are stored in ``st.session_state.last_api`` and
        ``st.session_state.last_raw``.
    """
    cfg = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    headers.update(cfg.get("extra_headers") or {})
    payload = {
        "model": cfg["model"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    if cfg.get("response_format"):
        payload["response_format"] = cfg["response_format"]

    # Only transport problems are "connection errors"; keep the try minimal so
    # programming errors and malformed responses are not mislabelled.
    try:
        with st.spinner(f"🔍 Querying {model_choice}..."):
            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
    except requests.RequestException as e:
        st.error(f"Connection error: {e}")
        return None

    if r.status_code != 200:
        if "No instances available" in r.text or r.status_code == 503:
            st.error(f"{model_choice} is currently unavailable. Please try again later or select another model.")
        else:
            st.error(f"🚨 API Error {r.status_code}: {r.text}")
        return None

    try:
        content = r.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # 200 status, but the body is not JSON or lacks choices/message/content.
        st.error(f"Unexpected response format from {model_choice}: {e!r}")
        return None

    st.session_state.last_api = content
    st.session_state.last_raw = r.text
    return content
85
 
86
def clean_json_response(text):
    """Extract and parse the JSON object embedded in an LLM reply.

    Markdown code fences are removed, then the span from the first ``{`` to
    the last ``}`` is parsed. Parsing is attempted strictly first; only when
    that fails are the textual repairs applied (trailing commas removed, then
    the doubled-quote repair), so valid JSON whose string values happen to
    contain ``, ]`` or ``, }`` is returned unmodified.

    Args:
        text: Raw model output; may be ``None`` or empty.

    Returns:
        The parsed object, or ``None`` when ``text`` is empty, no ``{...}``
        span exists, or the span cannot be parsed even after repair. The two
        latter cases are reported through ``st.error`` / ``st.code``.
    """
    if not text:
        return None

    stripped = re.sub(r'```(?:json)?', '', text).strip()
    start, end = stripped.find('{'), stripped.rfind('}') + 1
    # end <= start also covers a stray '}' that appears before the first '{'.
    if start < 0 or end <= start:
        st.error("Couldn't locate JSON in response.")
        st.code(text)
        return None
    frag = stripped[start:end]

    # 1) Strict parse: never rewrite a payload that is already valid.
    try:
        return json.loads(frag)
    except json.JSONDecodeError:
        pass

    # 2) Drop trailing commas before a closing brace/bracket.
    frag = re.sub(r',\s*([}\]])', r'\1', frag)
    try:
        return json.loads(frag)
    except json.JSONDecodeError as e:
        # 3) Last resort: doubled-quote repair in front of a following key.
        repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            st.error(f"JSON parse error: {e}")
            st.code(frag)
            return None
108
 
109
def fallback_supplier(text):
    """Return the first non-blank line of ``text``, stripped, as a supplier guess.

    Args:
        text: Invoice text; ``None`` and the empty string are accepted.

    Returns:
        The first line that is non-empty after stripping whitespace, or
        ``None`` when there is no such line.
    """
    if not text:
        return None
    stripped_lines = (line.strip() for line in text.splitlines())
    return next((line for line in stripped_lines if line), None)
115
 
116
def get_extraction_prompt(model_choice, txt):
    """Build the invoice-extraction prompt for ``model_choice``.

    The previous body contained only an "omitted for brevity" placeholder, so
    the model received the invoice text with no instructions at all. The
    instructions below request the key names that ``extract_invoice_info``
    in this file reads: a flat object for labels starting with "DeepSeek",
    and a nested header object for every other model.
    NOTE(review): the original prompt wording is not visible here; adjust the
    instruction text if the earlier prompt is recovered.

    Args:
        model_choice: Model label; only its "DeepSeek" prefix is inspected.
        txt: Invoice text appended after the instructions.

    Returns:
        The full prompt string, ending with ``"\\nInvoice Text:\\n"`` + ``txt``.
    """
    header_keys = (
        "invoice_number, invoice_date, po_number, invoice_value, "
        "supplier_name, customer_name"
    )
    if model_choice.startswith("DeepSeek"):
        schema = (
            f"Put these keys at the top level of the object: {header_keys}. "
            'Also include a top-level "line_items" array whose elements are '
            "objects with the keys: description, quantity, unit_price, "
            "total_price."
        )
    else:
        schema = (
            'Use exactly two top-level keys. "invoice_header" is an object '
            f"with the keys: {header_keys}. "
            '"line_items" is an array whose elements are objects with the '
            "keys: item_number, description, quantity, unit_price, "
            "total_price."
        )
    return (
        "Extract the structured data from the invoice text below and reply "
        "with a single valid JSON object only (no explanation, no markdown). "
        f"{schema} "
        "Use null for any value that is not present in the invoice."
        "\nInvoice Text:\n"
        f"{txt}"
    )
123
 
124
def extract_invoice_info(model_choice, text):
    """Run LLM extraction on invoice text and normalise the result.

    Args:
        model_choice: A key of ``MODELS``; labels starting with "DeepSeek"
            are treated as returning a flat object (header keys at top level).
        text: Invoice text to extract from.

    Returns:
        ``{"invoice_header": dict, "line_items": list}``, or ``None`` when
        the model call fails or its reply cannot be parsed into a non-empty
        JSON object. Missing item keys are filled with ``None``; for
        non-DeepSeek models missing header keys are filled with ``None`` and
        an empty ``supplier_name`` falls back to ``fallback_supplier(text)``.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    if not data or not isinstance(data, dict):
        return None

    items = data.get("line_items", [])
    if not isinstance(items, list):
        items = []

    if model_choice.startswith("DeepSeek"):
        header = {k: v for k, v in data.items() if k != "line_items"}
        item_keys = ("description", "quantity", "unit_price", "total_price")
    else:
        header = data.get("invoice_header")
        if not isinstance(header, dict):
            # null / string / list here would crash on setdefault below.
            header = {}
        if not header and any(
            k in data for k in ("invoice_number", "supplier_name", "customer_name")
        ):
            # Flat reply: copy instead of aliasing `data`, so the header does
            # not end up containing the line items (or an empty nested header).
            header = {
                k: v
                for k, v in data.items()
                if k not in ("line_items", "invoice_header")
            }
        for k in (
            "invoice_number",
            "invoice_date",
            "po_number",
            "invoice_value",
            "supplier_name",
            "customer_name",
        ):
            header.setdefault(k, None)
        if not header.get("supplier_name"):
            header["supplier_name"] = fallback_supplier(text)
        item_keys = (
            "item_number",
            "description",
            "quantity",
            "unit_price",
            "total_price",
        )

    # Non-dict items are left untouched, as before; only dicts get defaults.
    for itm in items:
        if isinstance(itm, dict):
            for k in item_keys:
                itm.setdefault(k, None)
    return {"invoice_header": header, "line_items": items}
161
 
162
+ # --------- UNSTRACT API PDF-TO-TEXT HELPER ---------
163
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
164
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set this in your environment!
165
 
166
+ def extract_text_from_pdf_unstract(pdf_file):
167
+ headers = {"unstract-key": UNSTRACT_API_KEY}
168
+ # Step 1: POST /whisper with the PDF
169
+ files = {"file": pdf_file}
170
+ whisper_url = f"{UNSTRACT_BASE}/whisper"
171
+ with st.spinner("Uploading and processing PDF with Unstract..."):
172
+ r = requests.post(whisper_url, files=files, headers=headers)
173
+ if r.status_code != 202:
174
+ st.error(f"Unstract: Error uploading PDF: {r.status_code} - {r.text}")
175
+ return None
176
+ whisper_hash = r.json().get("whisper_hash")
177
+ if not whisper_hash:
178
+ st.error("Unstract: No whisper_hash received.")
179
+ return None
180
+
181
+ # Step 2: Poll /whisper-status until processed
182
+ status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
183
+ for i in range(30): # Wait up to ~30 x 2 = 60 seconds
184
+ status_r = requests.get(status_url, headers=headers)
185
+ if status_r.status_code != 200:
186
+ st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
187
+ return None
188
+ status = status_r.json().get("status")
189
+ if status == "processed":
190
+ break
191
+ st.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
192
+ time.sleep(2)
193
  else:
194
+ st.error("Unstract: Timeout waiting for OCR to finish.")
195
+ return None
196
+
197
+ # Step 3: GET /whisper-retrieve?whisper_hash=...&text_only=true
198
+ retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
199
+ r = requests.get(retrieve_url, headers=headers)
200
+ if r.status_code != 200:
201
+ st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
202
+ return None
203
+ return r.json().get("result_text") or r.text
204
+
205
+ # --------- INVOICE EXTRACTOR UI ---------
206
+ st.title("Invoice Extractor")
207
+ mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
208
+ inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
209
+ extracted_info = None
210
+
211
+ if st.button("Extract") and inv_pdf:
212
+ with st.spinner("Extracting text from PDF using Unstract..."):
213
+ text = extract_text_from_pdf_unstract(inv_pdf)
214
+ if text:
215
+ extracted_info = extract_invoice_info(mdl, text)
216
+ if extracted_info:
217
+ st.success("Extraction Complete")
218
+ st.subheader("Invoice Metadata")
219
+ st.table([{k.replace("_", " ").title(): v for k, v in extracted_info["invoice_header"].items()}])
220
+ st.subheader("Line Items")
221
+ st.table(extracted_info["line_items"])
222
+ st.session_state["last_extracted_info"] = extracted_info # store in session
223
+
224
+ # If we've already extracted info, or in this session, show further controls
225
+ extracted_info = extracted_info or st.session_state.get("last_extracted_info", None)
226
+ if extracted_info:
227
+ st.markdown("---")
228
+ st.subheader("📝 Fine-tune Extracted Data with Your Own Prompt")
229
+ user_prompt = st.text_area(
230
+ "Enter your prompt for further processing or transformation (the extracted JSON will be available as context).",
231
+ height=120,
232
+ key="custom_prompt"
233
+ )
234
+ model_2 = st.selectbox("Model for Fine-Tuning Prompt", list(MODELS.keys()), key="refine_model")
235
+ if st.button("Run Custom Prompt"):
236
+ refine_input = (
237
+ "Here is an extracted invoice in JSON format:\n"
238
+ f"{json.dumps(extracted_info, indent=2)}\n"
239
+ "Follow this instruction and return the result as a JSON object only (no explanation):\n"
240
+ f"{user_prompt}"
241
+ )
242
+ result = query_llm(model_2, refine_input)
243
+ refined_json = clean_json_response(result)
244
+ st.subheader("Fine-Tuned Output")
245
+ if refined_json:
246
+ st.json(refined_json)
247
+ else:
248
+ st.error("Could not parse a valid JSON output from the model.")
249
+ st.caption("The prompt is run on the above-extracted fields as JSON. Try instructions like: 'Add a new field for net_amount (amount minus tax) to each line item', or 'Summarize the total quantity ordered', etc.")
250
 
251
+ if "last_api" in st.session_state:
252
+ with st.expander("Debug"):
253
+ st.code(st.session_state.last_api)
254
+ st.code(st.session_state.last_raw)