UmaKumpatla committed on
Commit
143b3ce
·
verified ·
1 Parent(s): 037acbc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -47
app.py CHANGED
@@ -4,27 +4,84 @@ import json
4
  import re
5
  import requests
6
  import streamlit as st
 
7
  from docx import Document
8
  from pypdf import PdfReader
9
- import pandas as pd
10
 
 
11
  OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
12
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
13
  DEFAULT_MODEL = "deepseek/deepseek-chat"
14
 
15
- def extract_text_from_pdf(file_bytes):
16
- reader = PdfReader(io.BytesIO(file_bytes))
17
- texts = []
18
- for i, page in enumerate(reader.pages):
19
- text = page.extract_text() or ""
20
- text = re.sub(r"\s+", " ", text).strip()
21
- texts.append(f"[Page {i+1}] {text}")
22
- return "\n".join(texts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- def read_docx_template(file_bytes):
 
 
25
  return Document(io.BytesIO(file_bytes))
26
 
27
- def replace_placeholders_in_doc(doc, kv_pairs):
 
 
 
 
 
 
 
 
28
  pattern = re.compile(r"(
29
 
30
  \[
@@ -35,12 +92,18 @@ def replace_placeholders_in_doc(doc, kv_pairs):
35
 
36
  ?|\{([A-Z0-9_]+)\})")
37
 
38
- def repl(m):
39
- key = m.group(2) or m.group(3)
40
- return str(kv_pairs.get(key, kv_pairs.get(key.lower(), "")) or "")
 
 
 
 
41
  for p in doc.paragraphs:
42
  for r in p.runs:
43
  r.text = pattern.sub(repl, r.text)
 
 
44
  for table in doc.tables:
45
  for row in table.rows:
46
  for cell in row.cells:
@@ -48,51 +111,140 @@ def replace_placeholders_in_doc(doc, kv_pairs):
48
  for r in p.runs:
49
  r.text = pattern.sub(repl, r.text)
50
 
51
- def call_openrouter(model, system_prompt, user_prompt):
 
 
52
  if not OPENROUTER_API_KEY:
53
- raise RuntimeError("OpenRouter API key not configured.")
54
- headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
 
 
 
55
  payload = {
56
  "model": model,
57
- "messages":[{"role":"system","content":system_prompt},{"role":"user","content":user_prompt}],
58
- "temperature":0.2
 
 
 
59
  }
60
- resp = requests.post(f"{OPENROUTER_BASE_URL}/chat/completions", headers=headers, json=payload)
61
- return resp.json()["choices"][0]["message"]["content"]
 
 
 
 
 
62
 
63
- SYSTEM_PROMPT = "Return JSON key-value pairs only."
64
- USER_PROMPT_TEMPLATE = "Template:\n{template_text}\nReports:\n{reports_text}\n"
65
 
66
- def get_template_text_for_prompt(doc, max_chars=4000):
67
- texts = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
68
- return "\n".join(texts)[:max_chars]
 
 
 
 
 
 
 
69
 
70
- st.set_page_config(page_title="GLR Auto-Fill", page_icon="🧾")
71
- st.title("🧾 Insurance GLR Auto-Fill")
72
- st.caption("Upload a .docx template and one or more photo report PDFs. The app will extract text, infer fields via LLM, and produce a filled document.")
 
 
 
 
73
 
74
- template_file = st.file_uploader("Upload template (.docx)", type=["docx"])
75
- pdf_files = st.file_uploader("Upload photo reports (.pdf)", type=["pdf"], accept_multiple_files=True)
76
- api_key = st.text_input("OpenRouter API Key", type="password", value=OPENROUTER_API_KEY or "")
77
- model_name = st.text_input("Model", value=DEFAULT_MODEL)
78
-
79
- if st.button("Process"):
80
- if not template_file or not pdf_files:
81
- st.error("Upload both template and PDFs")
82
- elif not api_key:
83
- st.error("Please provide your OpenRouter API key")
84
- else:
85
- os.environ["OPENROUTER_API_KEY"] = api_key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  doc = read_docx_template(template_file.read())
 
 
87
  template_text = get_template_text_for_prompt(doc)
88
- reports_text = "\n\n".join([extract_text_from_pdf(f.read()) for f in pdf_files])
89
- user_prompt = USER_PROMPT_TEMPLATE.format(template_text=template_text, reports_text=reports_text)
 
 
 
 
 
 
 
 
 
 
 
90
  raw = call_openrouter(model_name, SYSTEM_PROMPT, user_prompt)
 
 
91
  st.code(raw, language="json")
92
- kv_pairs = json.loads(re.search(r"\{.*\}", raw, re.S).group(0))
 
 
 
 
93
  st.subheader("🔍 Extracted Key-Value Pairs")
94
- df = pd.DataFrame(list(kv_pairs.items()), columns=["Field", "Value"])
95
  st.dataframe(df, use_container_width=True)
 
 
 
96
  replace_placeholders_in_doc(doc, kv_pairs)
97
- out_buf = io.BytesIO(); doc.save(out_buf); out_buf.seek(0)
98
- st.download_button("⬇️ Download filled template", out_buf.getvalue(), "filled_template.docx")
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import re
5
  import requests
6
  import streamlit as st
7
+ import pandas as pd
8
  from docx import Document
9
  from pypdf import PdfReader
 
10
 
11
+ # ---- Config ----
12
  OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
13
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
14
  DEFAULT_MODEL = "deepseek/deepseek-chat"
15
 
16
# System message sent with every request: sets the extraction role and asks
# for JSON-only output with uppercase snake-case keys.
SYSTEM_PROMPT = (
    "You are an information extraction assistant for insurance claims. "
    "Return JSON only. Keys should be uppercase snake case matching template placeholders where possible. "
    "If a value is not found, use an empty string."
)

# The user prompt is assembled by plain concatenation:
#   USER_PROMPT_PREFIX + <template text> + USER_PROMPT_MIDDLE + <report text> + USER_PROMPT_SUFFIX
# Each piece is written as short adjacent string literals (not one multi-line
# literal) to avoid unterminated-string mistakes.
USER_PROMPT_PREFIX = (
    "Task: Extract key-value pairs to populate a DOCX insurance template. "
    "Template text:\n"
)
USER_PROMPT_MIDDLE = "\n---\nPhoto report corpus:\n"
USER_PROMPT_SUFFIX = (
    "\n---\nInstructions:\n"
    "- Identify likely template fields (e.g., XM8_DATE_INSPECTED, CLAIM_NUMBER, INSURED, POLICY_NUMBER, "
    "INSURED_P_STREET, INSURED_P_CITY, INSURED_P_STATE, INSURED_P_ZIP, DATE_OF_LOSS, CAUSE_AND_ORIGIN).\n"
    "- Extract values from the photo reports and template text where possible.\n"
    "- If not found, set the value to \"\".\n"
    "- Return STRICT JSON only, no extra commentary."
)

# ---- Streamlit page ----
st.set_page_config(page_title="GLR Auto-Fill", page_icon="🧾", layout="centered")
st.title("🧾 Insurance GLR Auto-Fill")
st.caption("Upload a .docx template and one or more photo report PDFs. The app will extract text, infer fields via an LLM, and produce a filled document.")

# ---- UI inputs ----
# One .docx template and any number of PDF reports.
template_file = st.file_uploader("Upload template (.docx)", type=["docx"])
pdf_files = st.file_uploader("Upload photo reports (.pdf)", type=["pdf"], accept_multiple_files=True)

# Optional overrides; the key field is pre-filled from the environment value.
with st.expander("Model & API settings", expanded=False):
    api_key_input = st.text_input("OpenRouter API Key", type="password", value=OPENROUTER_API_KEY or "")
    model_name = st.text_input("Model (OpenRouter)", value=DEFAULT_MODEL)
# A key typed in the UI replaces the module-level value that call_openrouter
# reads when it builds the Authorization header.
if api_key_input and api_key_input != OPENROUTER_API_KEY:
    OPENROUTER_API_KEY = api_key_input  # update runtime value
51
+
52
+
53
+ # ---- Helpers ----
54
def extract_text_from_pdf_bytes(file_bytes: bytes) -> str:
    """Return the text of one PDF as newline-joined ``[Page N] ...`` lines.

    Extraction is best-effort: a page whose text cannot be extracted yields an
    empty page entry, and a failure to open or walk the PDF appends a single
    ``[PDF_ERROR] <message>`` line instead of raising.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        One line per page (1-based page numbers) with all whitespace runs
        collapsed to single spaces.
    """
    lines = []
    try:
        pdf = PdfReader(io.BytesIO(file_bytes))
        for page_no, page in enumerate(pdf.pages, start=1):
            try:
                page_text = page.extract_text() or ""
            except Exception:
                # Keep going: one unreadable page should not lose the rest.
                page_text = ""
            collapsed = re.sub(r"\s+", " ", page_text).strip()
            lines.append(f"[Page {page_no}] {collapsed}")
    except Exception as exc:
        lines.append(f"[PDF_ERROR] {exc}")
    return "\n".join(lines)
70
 
71
+
72
def read_docx_template(file_bytes: bytes) -> Document:
    """Build a ``Document`` from the raw bytes of a .docx file.

    The bytes are wrapped in an in-memory stream so nothing is written to disk.
    """
    stream = io.BytesIO(file_bytes)
    return Document(stream)
75
 
76
+
77
+ def replace_placeholders_in_doc(doc: Document, kv_pairs: dict) -> None:
78
+ """
79
+ Replace placeholders in paragraphs and tables.
80
+ Supported placeholder styles:
81
+ - [FIELD]
82
+ - [[FIELD]]
83
+ - {FIELD}
84
+ """
85
  pattern = re.compile(r"(
86
 
87
  \[
 
92
 
93
  ?|\{([A-Z0-9_]+)\})")
94
 
95
+ def repl(match: re.Match) -> str:
96
+ key = match.group(2) or match.group(3) # capture inner FIELD
97
+ # try exact, else lowercase
98
+ val = kv_pairs.get(key, kv_pairs.get(key.lower(), ""))
99
+ return "" if val is None else str(val)
100
+
101
+ # Paragraphs
102
  for p in doc.paragraphs:
103
  for r in p.runs:
104
  r.text = pattern.sub(repl, r.text)
105
+
106
+ # Tables
107
  for table in doc.tables:
108
  for row in table.rows:
109
  for cell in row.cells:
 
111
  for r in p.runs:
112
  r.text = pattern.sub(repl, r.text)
113
 
114
+
115
def call_openrouter(model: str, system_prompt: str, user_prompt: str) -> str:
    """Send one chat-completion request to OpenRouter and return the reply text.

    Args:
        model: Model identifier placed in the request payload.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The ``content`` of the first choice's message, or ``""`` when the
        response has no choices, no message, or a null/missing content.

    Raises:
        RuntimeError: If no API key is configured or the HTTP status is not 200.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OpenRouter API key not configured. Set OPENROUTER_API_KEY in Secrets or enter it in settings.")
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.2,
    }
    url = f"{OPENROUTER_BASE_URL}/chat/completions"
    resp = requests.post(url, headers=headers, json=payload, timeout=90)
    if resp.status_code != 200:
        raise RuntimeError(f"OpenRouter API error: {resp.status_code} {resp.text}")
    data = resp.json()
    # `or` fallbacks (rather than .get defaults) also cover keys that are
    # present but empty or null: "choices": [] would otherwise raise
    # IndexError, and "content": null would leak None to the caller.
    choices = data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or ""
138
 
 
 
139
 
140
def safe_parse_json(text: str) -> dict:
    """Parse a JSON object out of model output.

    The model is asked to return JSON only, but replies are often wrapped in
    Markdown fences or surrounded by commentary. The whole text is tried
    first; failing that, every ``{`` is tried as the start of a JSON object
    and the first one that decodes to a dict is returned.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no JSON object can be decoded from ``text``.
    """
    # First attempt: the whole reply is the JSON document.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback: scan for an embedded object. raw_decode parses one JSON value
    # from the start of a string and ignores whatever follows it, so it copes
    # with nested braces and trailing prose. The standard-library `re` module
    # has no recursive patterns, so brace matching cannot be done by regex.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)

    raise ValueError("Model did not return valid JSON.")
169
+
170
+
171
def get_template_text_for_prompt(doc: Document, max_chars: int = 6000) -> str:
    """Collect the template's visible text for use as prompt context.

    Body paragraphs come first, followed by the paragraphs of every table
    cell (table by table, row by row). Each paragraph is stripped, blank ones
    are dropped, and the rest are joined with newlines.

    Args:
        doc: The loaded template document.
        max_chars: Maximum number of characters to return.

    Returns:
        The joined text, truncated to ``max_chars`` characters.
    """
    body_paragraphs = list(doc.paragraphs)
    table_paragraphs = [
        para
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        for para in cell.paragraphs
    ]
    stripped = (para.text.strip() for para in body_paragraphs + table_paragraphs)
    return "\n".join(chunk for chunk in stripped if chunk)[:max_chars]
187
+
188
+
189
# ---- Process button ----
# On click: validate the inputs, extract text, ask the LLM for field values,
# preview them, fill the template and offer the result for download.
if st.button("Process and Generate"):
    # Basic validation: report the first missing input and stop.
    if not template_file:
        st.error("Please upload a .docx template.")
        st.stop()
    if not pdf_files:
        st.error("Please upload at least one photo report PDF.")
        st.stop()
    if not OPENROUTER_API_KEY:
        st.error("OpenRouter API key is missing. Set it in settings.")
        st.stop()

    try:
        # Template and its plain text for the prompt
        doc = read_docx_template(template_file.read())
        template_text = get_template_text_for_prompt(doc)

        # One text block per uploaded PDF, separated by blank lines
        st.info("Extracting text from PDFs...")
        reports_text = "\n\n".join(
            [extract_text_from_pdf_bytes(pdf.read()) for pdf in pdf_files]
        )

        # Prompt = prefix + template text + middle + report text + suffix
        user_prompt = "".join(
            [
                USER_PROMPT_PREFIX,
                template_text,
                USER_PROMPT_MIDDLE,
                reports_text,
                USER_PROMPT_SUFFIX,
            ]
        )

        st.info("Calling LLM to interpret fields...")
        raw = call_openrouter(model_name, SYSTEM_PROMPT, user_prompt)

        # Raw reply is shown for debugging before it is parsed.
        st.code(raw, language="json")
        kv_pairs = safe_parse_json(raw)

        # Preview table, sorted by field name
        st.subheader("🔍 Extracted Key-Value Pairs")
        rows = sorted(kv_pairs.items())
        df = pd.DataFrame(rows, columns=["Field", "Value"])
        st.dataframe(df, use_container_width=True)

        # Fill the template and serialize it to an in-memory buffer
        st.info("Populating template...")
        replace_placeholders_in_doc(doc, kv_pairs)
        out_buf = io.BytesIO()
        doc.save(out_buf)
        out_buf.seek(0)

        st.success("Document generated successfully.")
        st.download_button(
            label="⬇️ Download filled template (.docx)",
            data=out_buf.getvalue(),
            file_name="filled_template.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

    except Exception as exc:
        st.error(f"Processing failed: {exc}")