Spaces:

UmaKumpatla
/

GLR_Pipeline_with_Streamlit

Sleeping

App Files Files Community

UmaKumpatla committed on Nov 24, 2025

Commit

143b3ce

verified ·

1 Parent(s): 037acbc

Update app.py

Browse files

Files changed (1) hide show

app.py +199 -47

app.py CHANGED Viewed

@@ -4,27 +4,84 @@ import json
 import re
 import requests
 import streamlit as st
 from docx import Document
 from pypdf import PdfReader
-import pandas as pd
 OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
 DEFAULT_MODEL = "deepseek/deepseek-chat"
-def extract_text_from_pdf(file_bytes):
-    reader = PdfReader(io.BytesIO(file_bytes))
-    texts = []
-    for i, page in enumerate(reader.pages):
-        text = page.extract_text() or ""
-        text = re.sub(r"\s+", " ", text).strip()
-        texts.append(f"[Page {i+1}] {text}")
-    return "\n".join(texts)
-def read_docx_template(file_bytes):
     return Document(io.BytesIO(file_bytes))
-def replace_placeholders_in_doc(doc, kv_pairs):
     pattern = re.compile(r"(
 \[
@@ -35,12 +92,18 @@ def replace_placeholders_in_doc(doc, kv_pairs):
 ?|\{([A-Z0-9_]+)\})")
-    def repl(m):
-        key = m.group(2) or m.group(3)
-        return str(kv_pairs.get(key, kv_pairs.get(key.lower(), "")) or "")
     for p in doc.paragraphs:
         for r in p.runs:
             r.text = pattern.sub(repl, r.text)
     for table in doc.tables:
         for row in table.rows:
             for cell in row.cells:
@@ -48,51 +111,140 @@ def replace_placeholders_in_doc(doc, kv_pairs):
                     for r in p.runs:
                         r.text = pattern.sub(repl, r.text)
-def call_openrouter(model, system_prompt, user_prompt):
     if not OPENROUTER_API_KEY:
-        raise RuntimeError("OpenRouter API key not configured.")
-    headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
     payload = {
         "model": model,
-        "messages":[{"role":"system","content":system_prompt},{"role":"user","content":user_prompt}],
-        "temperature":0.2
     }
-    resp = requests.post(f"{OPENROUTER_BASE_URL}/chat/completions", headers=headers, json=payload)
-    return resp.json()["choices"][0]["message"]["content"]
-SYSTEM_PROMPT = "Return JSON key-value pairs only."
-USER_PROMPT_TEMPLATE = "Template:\n{template_text}\nReports:\n{reports_text}\n"
-def get_template_text_for_prompt(doc, max_chars=4000):
-    texts = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
-    return "\n".join(texts)[:max_chars]
-st.set_page_config(page_title="GLR Auto-Fill", page_icon="🧾")
-st.title("🧾 Insurance GLR Auto-Fill")
-st.caption("Upload a .docx template and one or more photo report PDFs. The app will extract text, infer fields via LLM, and produce a filled document.")
-template_file = st.file_uploader("Upload template (.docx)", type=["docx"])
-pdf_files = st.file_uploader("Upload photo reports (.pdf)", type=["pdf"], accept_multiple_files=True)
-api_key = st.text_input("OpenRouter API Key", type="password", value=OPENROUTER_API_KEY or "")
-model_name = st.text_input("Model", value=DEFAULT_MODEL)
-if st.button("Process"):
-    if not template_file or not pdf_files:
-        st.error("Upload both template and PDFs")
-    elif not api_key:
-        st.error("Please provide your OpenRouter API key")
-    else:
-        os.environ["OPENROUTER_API_KEY"] = api_key
         doc = read_docx_template(template_file.read())
         template_text = get_template_text_for_prompt(doc)
-        reports_text = "\n\n".join([extract_text_from_pdf(f.read()) for f in pdf_files])
-        user_prompt = USER_PROMPT_TEMPLATE.format(template_text=template_text, reports_text=reports_text)
         raw = call_openrouter(model_name, SYSTEM_PROMPT, user_prompt)
         st.code(raw, language="json")
-        kv_pairs = json.loads(re.search(r"\{.*\}", raw, re.S).group(0))
         st.subheader("🔍 Extracted Key-Value Pairs")
-        df = pd.DataFrame(list(kv_pairs.items()), columns=["Field", "Value"])
         st.dataframe(df, use_container_width=True)
         replace_placeholders_in_doc(doc, kv_pairs)
-        out_buf = io.BytesIO(); doc.save(out_buf); out_buf.seek(0)
-        st.download_button("⬇️ Download filled template", out_buf.getvalue(), "filled_template.docx")

 import re
 import requests
 import streamlit as st
+import pandas as pd
 from docx import Document
 from pypdf import PdfReader
+# ---- Config ----
 OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
 DEFAULT_MODEL = "deepseek/deepseek-chat"
+# System message sent with every request: asks for JSON-only output, uppercase
+# snake-case keys, and empty strings for values that cannot be found.
+SYSTEM_PROMPT = (
+    "You are an information extraction assistant for insurance claims. "
+    "Return JSON only. Keys should be uppercase snake case matching template placeholders where possible. "
+    "If a value is not found, use an empty string."
+)
+# The user prompt is kept as three pieces made of single-line string literals
+# (instead of one multi-line literal) to avoid unterminated-string mistakes.
+# PREFIX introduces the task and is followed by the template text.
+USER_PROMPT_PREFIX = (
+    "Task: Extract key-value pairs to populate a DOCX insurance template. "
+    "Template text:\n"
+)
+# MIDDLE separates the template text from the extracted photo-report text.
+USER_PROMPT_MIDDLE = "\n---\nPhoto report corpus:\n"
+# SUFFIX carries the extraction instructions, including example field names.
+USER_PROMPT_SUFFIX = (
+    "\n---\nInstructions:\n"
+    "- Identify likely template fields (e.g., XM8_DATE_INSPECTED, CLAIM_NUMBER, INSURED, POLICY_NUMBER, "
+    "INSURED_P_STREET, INSURED_P_CITY, INSURED_P_STATE, INSURED_P_ZIP, DATE_OF_LOSS, CAUSE_AND_ORIGIN).\n"
+    "- Extract values from the photo reports and template text where possible.\n"
+    "- If not found, set the value to \"\".\n"
+    "- Return STRICT JSON only, no extra commentary."
+)
+# ---- Streamlit page ----
+# Page chrome: browser-tab title/icon, heading, and a one-line usage hint.
+st.set_page_config(page_title="GLR Auto-Fill", page_icon="🧾", layout="centered")
+st.title("🧾 Insurance GLR Auto-Fill")
+st.caption("Upload a .docx template and one or more photo report PDFs. The app will extract text, infer fields via an LLM, and produce a filled document.")
+# ---- UI inputs ----
+# One .docx template and any number of PDF reports.
+template_file = st.file_uploader("Upload template (.docx)", type=["docx"])
+pdf_files = st.file_uploader("Upload photo reports (.pdf)", type=["pdf"], accept_multiple_files=True)
+# Model and API key live in a collapsed expander; the key field is pre-filled
+# from the OPENROUTER_API_KEY environment value read above.
+with st.expander("Model & API settings", expanded=False):
+    api_key_input = st.text_input("OpenRouter API Key", type="password", value=OPENROUTER_API_KEY or "")
+    model_name = st.text_input("Model (OpenRouter)", value=DEFAULT_MODEL)
+    # Rebind the module-level key when the user typed a different one;
+    # call_openrouter reads this global at call time.
+    if api_key_input and api_key_input != OPENROUTER_API_KEY:
+        OPENROUTER_API_KEY = api_key_input  # update runtime value
+# ---- Helpers ----
+def extract_text_from_pdf_bytes(file_bytes: bytes) -> str:
+    """Return the text of one in-memory PDF, one ``[Page N] ...`` line per page.
+
+    Extraction is best-effort: a page whose text cannot be extracted yields an
+    empty page line, and a failure while opening or iterating the PDF appends
+    a ``[PDF_ERROR] <message>`` line instead of raising.
+    """
+    page_lines = []
+    try:
+        pdf = PdfReader(io.BytesIO(file_bytes))
+        for page_no, page in enumerate(pdf.pages, start=1):
+            try:
+                raw_text = page.extract_text() or ""
+            except Exception:
+                raw_text = ""
+            # Collapse every whitespace run to a single space.
+            collapsed = re.sub(r"\s+", " ", raw_text).strip()
+            page_lines.append(f"[Page {page_no}] {collapsed}")
+    except Exception as e:
+        page_lines.append(f"[PDF_ERROR] {e}")
+    return "\n".join(page_lines)
+def read_docx_template(file_bytes: bytes) -> Document:
+    """Build a python-docx document from the raw bytes of a ``.docx`` file."""
+    stream = io.BytesIO(file_bytes)
+    return Document(stream)
+def replace_placeholders_in_doc(doc: Document, kv_pairs: dict) -> None:
+    """
+    Replace placeholders in paragraphs and tables.
+    Supported placeholder styles:
+      - [FIELD]
+      - [[FIELD]]
+      - {FIELD}
+    """
     pattern = re.compile(r"(
 \[
 ?|\{([A-Z0-9_]+)\})")
+    def repl(match: re.Match) -> str:
+        key = match.group(2) or match.group(3)  # capture inner FIELD
+        # try exact, else lowercase
+        val = kv_pairs.get(key, kv_pairs.get(key.lower(), ""))
+        return "" if val is None else str(val)
+    # Paragraphs
     for p in doc.paragraphs:
         for r in p.runs:
             r.text = pattern.sub(repl, r.text)
+    # Tables
     for table in doc.tables:
         for row in table.rows:
             for cell in row.cells:
                     for r in p.runs:
                         r.text = pattern.sub(repl, r.text)
+def call_openrouter(model: str, system_prompt: str, user_prompt: str) -> str:
+    """Send one chat-completion request to OpenRouter and return the reply text.
+
+    Args:
+        model: Model identifier placed in the request payload.
+        system_prompt: Content of the ``system`` message.
+        user_prompt: Content of the ``user`` message.
+
+    Returns:
+        The ``content`` of the first choice's message, or an empty string when
+        the response has no choices, no message, or no content.
+
+    Raises:
+        RuntimeError: If no API key is configured, the HTTP status is not 200,
+            or the response body is not a JSON object.
+    """
     if not OPENROUTER_API_KEY:
+        raise RuntimeError("OpenRouter API key not configured. Set OPENROUTER_API_KEY in Secrets or enter it in settings.")
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+    }
     payload = {
         "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "temperature": 0.2,
     }
+    url = f"{OPENROUTER_BASE_URL}/chat/completions"
+    resp = requests.post(url, headers=headers, json=payload, timeout=90)
+    if resp.status_code != 200:
+        raise RuntimeError(f"OpenRouter API error: {resp.status_code} {resp.text}")
+    try:
+        data = resp.json()
+    except ValueError as exc:
+        # Keep the function's single error type for callers; truncate the body
+        # so an HTML error page does not flood the UI.
+        raise RuntimeError(f"OpenRouter returned a non-JSON response: {resp.text[:200]}") from exc
+    if not isinstance(data, dict):
+        raise RuntimeError(f"OpenRouter returned an unexpected response: {resp.text[:200]}")
+    # `or` fallbacks cover keys that are present but empty or null, which the
+    # `.get(key, default)` form does not (an empty list would raise IndexError).
+    choices = data.get("choices") or [{}]
+    message = choices[0].get("message") or {}
+    return message.get("content") or ""
+def safe_parse_json(text: str) -> dict:
+    """Parse a JSON object out of model output.
+
+    The model is asked to return JSON only, but replies are often wrapped in
+    Markdown fences or surrounded by commentary. The whole text is tried
+    first; failing that, every ``{`` position is tried in order and the first
+    one from which a complete JSON object decodes wins.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The decoded JSON object as a dict.
+
+    Raises:
+        ValueError: If ``text`` is not a string or contains no decodable JSON
+            object.
+    """
+    if not isinstance(text, str):
+        raise ValueError("Model did not return valid JSON.")
+    # First attempt: direct parse of the whole reply.
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        parsed = None
+    if isinstance(parsed, dict):
+        return parsed
+    # Fallback: scan for an embedded object. Python's `re` has no recursive
+    # patterns (`(?R)` raises re.error), so brace matching is delegated to the
+    # JSON decoder itself, which also handles braces inside string values.
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            candidate, _end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            candidate = None
+        if isinstance(candidate, dict):
+            return candidate
+        start = text.find("{", start + 1)
+    raise ValueError("Model did not return valid JSON.")
+def get_template_text_for_prompt(doc: Document, max_chars: int = 6000) -> str:
+    """Flatten the document's text into one newline-joined string for the prompt.
+
+    Body paragraphs come first, followed by the paragraphs of every table cell
+    (table by table, row by row). Each paragraph is stripped, blank ones are
+    dropped, and the joined result is cut to at most ``max_chars`` characters.
+    """
+    body_paragraphs = list(doc.paragraphs)
+    cell_paragraphs = [
+        para
+        for table in doc.tables
+        for row in table.rows
+        for cell in row.cells
+        for para in cell.paragraphs
+    ]
+    stripped = (para.text.strip() for para in body_paragraphs + cell_paragraphs)
+    return "\n".join(text for text in stripped if text)[:max_chars]
+# ---- Process button ----
+# Runs the whole pipeline on click: validate inputs, extract text, query the
+# LLM, preview the parsed fields, fill the template, and offer it for download.
+if st.button("Process and Generate"):
+    # Basic validation
+    if not template_file:
+        st.error("Please upload a .docx template.")
+        st.stop()
+    # An empty upload list is falsy, so no separate length check is needed.
+    if not pdf_files:
+        st.error("Please upload at least one photo report PDF.")
+        st.stop()
+    if not OPENROUTER_API_KEY:
+        st.error("OpenRouter API key is missing. Set it in settings.")
+        st.stop()
+    try:
+        # Load template. getvalue() returns the full upload regardless of the
+        # stream position, whereas read() returns nothing once consumed.
+        doc = read_docx_template(template_file.getvalue())
+        # Template text for prompt
         template_text = get_template_text_for_prompt(doc)
+        # Extract PDFs
+        st.info("Extracting text from PDFs...")
+        reports_text = "\n\n".join(
+            extract_text_from_pdf_bytes(f.getvalue()) for f in pdf_files
+        )
+        # Build user prompt without multiline literal issues
+        user_prompt = USER_PROMPT_PREFIX + template_text + USER_PROMPT_MIDDLE + reports_text + USER_PROMPT_SUFFIX
+        # Call LLM
+        st.info("Calling LLM to interpret fields...")
         raw = call_openrouter(model_name, SYSTEM_PROMPT, user_prompt)
+        # Show raw for debugging
         st.code(raw, language="json")
+        # Parse JSON
+        kv_pairs = safe_parse_json(raw)
+        # Preview table
         st.subheader("🔍 Extracted Key-Value Pairs")
+        df = pd.DataFrame(sorted(kv_pairs.items()), columns=["Field", "Value"])
         st.dataframe(df, use_container_width=True)
+        # Populate and export DOCX
+        st.info("Populating template...")
         replace_placeholders_in_doc(doc, kv_pairs)
+        out_buf = io.BytesIO()
+        doc.save(out_buf)
+        out_buf.seek(0)
+        st.success("Document generated successfully.")
+        st.download_button(
+            label="⬇️ Download filled template (.docx)",
+            data=out_buf.getvalue(),
+            file_name="filled_template.docx",
+            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+    except Exception as e:
+        # Top-level UI boundary: surface any failure in the page instead of a traceback.
+        st.error(f"Processing failed: {e}")