Toulik committed on
Commit
4975bf7
·
verified ·
1 Parent(s): abc5d71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +376 -140
app.py CHANGED
@@ -1,4 +1,24 @@
1
  # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import json
4
  import tempfile
@@ -10,32 +30,61 @@ import gradio as gr
10
  from PIL import Image
11
  import fitz # PyMuPDF
12
  import pytesseract
13
- # pdf2image is optional here, we used PyMuPDF for PDF -> image rendering fallback
14
- # from pdf2image import convert_from_path
15
 
16
- # OpenAI new client
17
  from openai import OpenAI
18
 
19
  # -----------------------
20
- # Configuration / Client
21
  # -----------------------
22
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
23
  if not OPENAI_API_KEY:
24
- raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to Secrets in HF Space or set env var.")
25
 
26
- # Create the new OpenAI client (new API surface for openai>=1.0.0)
27
  client = OpenAI(api_key=OPENAI_API_KEY)
28
 
29
- LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change to your available model id if needed
30
- EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small") # optional
31
 
32
- # ----------------------
33
- # Text extraction utils
34
- # ----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def extract_text_from_pdf(path: str) -> str:
36
- """
37
- Extract text using PyMuPDF. If a page has no extractable text, render to image and OCR with pytesseract.
38
- """
39
  try:
40
  doc = fitz.open(path)
41
  except Exception as e:
@@ -48,7 +97,7 @@ def extract_text_from_pdf(path: str) -> str:
48
  if txt:
49
  texts.append(txt)
50
  else:
51
- # fallback to render page and OCR
52
  pix = page.get_pixmap(dpi=200)
53
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
54
  pix.save(tmp.name)
@@ -62,9 +111,6 @@ def extract_text_from_image(path: str) -> str:
62
  return pytesseract.image_to_string(img).strip()
63
 
64
 
65
- # ----------------------
66
- # Chunker
67
- # ----------------------
68
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
69
  paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
70
  chunks: List[str] = []
@@ -80,109 +126,22 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
80
  chunks.append(current)
81
  return chunks
82
 
83
-
84
- # ----------------------
85
- # OpenAI LLM & embeddings helpers (new client surface)
86
- # ----------------------
87
- def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -> Dict[str, Any]:
88
- """
89
- Prompt GPT-5 to return a single JSON object matching the schema the user specified.
90
- We ask the model to return JSON only. We do a best-effort parse and return structured dict.
91
- """
92
- prompt_intro = (
93
- "You are an automated document taxonomy and tagging assistant for enterprise catalogs.\n\n"
94
- f"Document title: {title}\n\n"
95
- f"Short document text (first ~1000 chars): {short_text}\n\n"
96
- "Top content chunks (short):\n"
97
- )
98
-
99
- prompt_chunks = ""
100
- for i, c in enumerate(top_chunks[:6]):
101
- chunk_text_clean = c[:800].replace("\n", " ")
102
- prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
103
-
104
- prompt_end = (
105
- "Task: Produce a single JSON object (machine parseable) with EXACT keys:\n"
106
- "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
107
- "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
108
- "Guidelines:\n"
109
- "- summary: 1-2 sentences summarizing the doc.\n"
110
- "- doc_type: short enum-like string (e.g., architecture_comparison, whitepaper, design_doc)\n"
111
- "- tags: up to 8 short tags like arch:docai, topic:ocr-parsing\n"
112
- "- tag_confidences: map with floats 0-1 for each tag\n"
113
- "- taxonomy_path: hierarchical list, e.g. [\"Technology\",\"Document Processing\",\"OCR & Parsing\"]\n"
114
- "- extracted_entities: map with keys like platforms, tools (each is an array)\n"
115
- "- raw_url: if not available, return an empty string\n"
116
- "- ingest_timestamp: ISO8601 with timezone (e.g., 2025-09-19T09:13:00+05:30)\n\n"
117
- "OUTPUT: ONLY THE JSON OBJECT. DO NOT PROVIDE ANY ADDITIONAL TEXT.\n"
118
- )
119
-
120
- prompt = prompt_intro + prompt_chunks + prompt_end
121
-
122
- # Call using new client
123
- try:
124
- resp = client.chat.completions.create(
125
- model=LLM_MODEL,
126
- messages=[{"role": "user", "content": prompt}],
127
- max_completion_tokens=1500,
128
- seed=42, # optional: for reproducibility
129
- )
130
-
131
- except Exception as e:
132
- return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}
133
-
134
- # Extract text robustly
135
- try:
136
- text = resp.choices[0].message["content"].strip()
137
- except Exception:
138
- # fallback attribute access if response uses attribute objects
139
- try:
140
- text = resp.choices[0].message.content.strip()
141
- except Exception:
142
- text = str(resp)
143
-
144
- # Try to extract JSON block
145
- m = re.search(r"\{[\s\S]*\}$", text)
146
- json_text = m.group(0) if m else text
147
-
148
- try:
149
- data = json.loads(json_text)
150
- except Exception:
151
- data = {"_parsing_error": True, "raw_output": text}
152
- return data
153
-
154
-
155
- def get_embeddings_for_chunks(chunks: List[str], model: str = EMBEDDING_MODEL) -> List[List[float]]:
156
- try:
157
- resp = client.embeddings.create(model=model, input=chunks)
158
- except Exception as e:
159
- raise RuntimeError(f"Embeddings API call failed: {e}")
160
-
161
- # resp.data is an array of objects containing .embedding
162
- try:
163
- return [item.embedding for item in resp.data]
164
- except Exception:
165
- # fallback to dict-like access
166
- return [item["embedding"] for item in resp.data]
167
-
168
-
169
- # ----------------------
170
- # Robust uploader helper + processing
171
- # ----------------------
172
  def save_uploaded_to_tmp(file_obj):
173
  """
174
- Accepts multiple upload types commonly returned by gradio:
175
- - file-like object with .read()
176
- - dict-like {"name": "...", "data": b'...'}
177
- - path string (existing file path)
178
- - objects with a .name attribute pointing to a saved path (NamedString)
179
- Returns (tmp_path, original_name)
180
  """
181
- # Case 1: file-like object with .read()
182
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
183
  try:
184
  content = file_obj.read()
185
- # sometimes content may be str
186
  if isinstance(content, str):
187
  content = content.encode("utf-8")
188
  name = getattr(file_obj, "name", "uploaded_file")
@@ -193,7 +152,7 @@ def save_uploaded_to_tmp(file_obj):
193
  except Exception:
194
  pass
195
 
196
- # Case 2: dict-like returned by some gradio versions
197
  if isinstance(file_obj, dict):
198
  if "data" in file_obj and "name" in file_obj:
199
  data = file_obj["data"]
@@ -205,7 +164,7 @@ def save_uploaded_to_tmp(file_obj):
205
  tmp.write(data)
206
  return tmp.name, os.path.basename(name)
207
 
208
- # Case 3: file_obj is a path string
209
  if isinstance(file_obj, str):
210
  if os.path.exists(file_obj):
211
  return file_obj, os.path.basename(file_obj)
@@ -219,7 +178,7 @@ def save_uploaded_to_tmp(file_obj):
219
  except Exception:
220
  pass
221
 
222
- # Case 4: object has .name attribute referencing a real path (NamedString)
223
  name = getattr(file_obj, "name", None)
224
  if name and isinstance(name, str):
225
  try:
@@ -232,20 +191,182 @@ def save_uploaded_to_tmp(file_obj):
232
  except Exception:
233
  pass
234
 
235
- raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. Value repr: {repr(file_obj)[:400]}")
236
 
237
 
238
- def process_file(file_obj) -> Dict[str, Any]:
 
 
 
 
 
 
239
  """
240
- Orchestrates saving uploaded file, extracting text, chunking, calling LLM and post-processing.
241
- Returns: metadata dict or {"error": "..."} on failure.
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  try:
244
  tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
245
  except Exception as e:
246
  return {"error": f"Failed to save uploaded file: {e}"}
247
 
248
- # Extract text
249
  try:
250
  if orig_name.lower().endswith(".pdf"):
251
  extracted_text = extract_text_from_pdf(tmp_path)
@@ -257,23 +378,28 @@ def process_file(file_obj) -> Dict[str, Any]:
257
  if not extracted_text:
258
  return {"error": "No text found in document after extraction."}
259
 
260
- # Chunk and pick top chunks
261
  chunks = chunk_text(extracted_text)
262
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
263
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
264
 
265
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
266
 
267
- # Call LLM to get JSON metadata
268
- metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks)
269
 
 
270
  if metadata.get("_api_error"):
271
  return {"error": metadata.get("error")}
272
 
 
273
  if metadata.get("_parsing_error"):
274
- return {"error": "LLM output parsing failed. See raw_output.", "raw_output": metadata.get("raw_output")}
275
-
276
- # Ensure required keys and add ingestion timestamp if missing
 
 
 
 
 
277
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
278
  metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
279
  metadata.setdefault("title", orig_name)
@@ -284,9 +410,79 @@ def process_file(file_obj) -> Dict[str, Any]:
284
  return metadata
285
 
286
 
287
- # ----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  # Gradio UI
289
- # ----------------------
290
  with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
291
  gr.Markdown("## 📂 Upload a PDF or Image — the app will classify, tag, and propose a taxonomy using GPT-5")
292
  with gr.Row():
@@ -295,28 +491,68 @@ with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
295
  run_button = gr.Button("Process document")
296
  status = gr.Textbox(label="Status", value="", interactive=False)
297
  download_button = gr.File(label="Download metadata JSON", visible=False)
 
298
  with gr.Column(scale=1):
299
  output_json = gr.JSON(label="Document metadata (JSON)")
 
 
 
 
 
300
 
301
- def on_process(file_obj):
302
- status.value = "Processing..."
 
 
303
  try:
304
  result = process_file(file_obj)
305
  except Exception as e:
306
- return gr.update(value={}), gr.update(value=f"Failed: {e}"), None
307
 
308
  if result.get("error"):
309
- return gr.update(value={"error": result.get("error"), "raw_output": result.get("raw_output", "")}), gr.update(value=f"Error: {result.get('error')}"), None
310
-
311
- # create a temp json file for download
 
 
 
 
 
 
 
312
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
313
  with open(tmpf.name, "w", encoding="utf8") as f:
314
  json.dump(result, f, indent=2, ensure_ascii=False)
315
 
316
- return gr.update(value=result), gr.update(value="Done"), tmpf.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
- run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button])
 
 
319
 
320
- # Launch
321
  if __name__ == "__main__":
322
  demo.launch()
 
1
  # app.py
2
+ """
3
+ Gradio app: upload PDF / Image -> extract text (PyMuPDF + Tesseract fallback) ->
4
+ call GPT-5 (OpenAI new client) to produce machine-parseable metadata JSON (between markers) ->
5
+ validate JSON (jsonschema) -> show JSON and allow download.
6
+
7
+ Requirements (add to requirements.txt for HF Space or local venv):
8
+ gradio>=3.0
9
+ PyMuPDF
10
+ pytesseract
11
+ Pillow
12
+ openai>=1.0.0
13
+ jsonschema
14
+
15
+ System packages required (HF Spaces apt-packages):
16
+ tesseract-ocr
17
+ poppler-utils
18
+
19
+ Put OPENAI_API_KEY into your environment/Space Secrets.
20
+ """
21
+
22
  import os
23
  import json
24
  import tempfile
 
30
  from PIL import Image
31
  import fitz # PyMuPDF
32
  import pytesseract
33
+ from jsonschema import validate as json_validate, ValidationError
 
34
 
35
+ # new OpenAI client surface
36
  from openai import OpenAI
37
 
38
  # -----------------------
39
+ # Config / client
40
  # -----------------------
41
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
42
  if not OPENAI_API_KEY:
43
+ raise RuntimeError("OPENAI_API_KEY not found in environment. Add to HF Space Secrets or env var.")
44
 
 
45
  client = OpenAI(api_key=OPENAI_API_KEY)
46
 
47
+ LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change if you have a different model id
48
+ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
49
 
50
+ # -----------------------
51
+ # JSON schema for validation
52
+ # -----------------------
53
# JSON Schema (jsonschema draft style) used to validate the metadata object
# the LLM returns. All eleven keys are mandatory; unknown extra keys are
# tolerated ("additionalProperties": True) so the model may enrich the
# object without failing validation.
METADATA_SCHEMA = {
    "type": "object",
    "required": [
        "doc_id",
        "title",
        "summary",
        "doc_type",
        "source",
        "tags",
        "tag_confidences",
        "taxonomy_path",
        "extracted_entities",
        "raw_url",
        "ingest_timestamp",
    ],
    "properties": {
        # Scalar descriptive fields.
        "doc_id": {"type": "string"},
        "title": {"type": "string"},
        "summary": {"type": "string"},
        "doc_type": {"type": "string"},
        "source": {"type": "string"},
        # Tagging: a flat list of tag strings plus a tag -> confidence map.
        # NOTE(review): tag_confidences values are not constrained to numbers
        # here — confirm whether {"type": "number"} items were intended.
        "tags": {"type": "array", "items": {"type": "string"}},
        "tag_confidences": {"type": "object"},
        # Hierarchical classification path, root first.
        "taxonomy_path": {"type": "array", "items": {"type": "string"}},
        "extracted_entities": {"type": "object"},
        "raw_url": {"type": "string"},
        "ingest_timestamp": {"type": "string"},
    },
    "additionalProperties": True,
}
83
+
84
+ # -----------------------
85
+ # Extraction helpers
86
+ # -----------------------
87
  def extract_text_from_pdf(path: str) -> str:
 
 
 
88
  try:
89
  doc = fitz.open(path)
90
  except Exception as e:
 
97
  if txt:
98
  texts.append(txt)
99
  else:
100
+ # render and OCR
101
  pix = page.get_pixmap(dpi=200)
102
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
103
  pix.save(tmp.name)
 
111
  return pytesseract.image_to_string(img).strip()
112
 
113
 
 
 
 
114
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
115
  paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
116
  chunks: List[str] = []
 
126
  chunks.append(current)
127
  return chunks
128
 
129
+ # -----------------------
130
+ # Utilities for robust upload handling
131
+ # -----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def save_uploaded_to_tmp(file_obj):
133
  """
134
+ Accepts common Gradio upload types:
135
+ - file-like (has .read())
136
+ - dict-like {"name": ..., "data": b'...'}
137
+ - path string
138
+ - objects with .name attribute pointing to a path (NamedString)
139
+ Returns (tmp_path, original_filename)
140
  """
141
+ # file-like
142
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
143
  try:
144
  content = file_obj.read()
 
145
  if isinstance(content, str):
146
  content = content.encode("utf-8")
147
  name = getattr(file_obj, "name", "uploaded_file")
 
152
  except Exception:
153
  pass
154
 
155
+ # dict-like
156
  if isinstance(file_obj, dict):
157
  if "data" in file_obj and "name" in file_obj:
158
  data = file_obj["data"]
 
164
  tmp.write(data)
165
  return tmp.name, os.path.basename(name)
166
 
167
+ # path string
168
  if isinstance(file_obj, str):
169
  if os.path.exists(file_obj):
170
  return file_obj, os.path.basename(file_obj)
 
178
  except Exception:
179
  pass
180
 
181
+ # object with .name attribute referencing existing path
182
  name = getattr(file_obj, "name", None)
183
  if name and isinstance(name, str):
184
  try:
 
191
  except Exception:
192
  pass
193
 
194
+ raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
195
 
196
 
197
+ # -----------------------
198
+ # JSON extraction & validation helpers
199
+ # -----------------------
200
def extract_json_from_text(text: str) -> str:
    """Extract a JSON object string from raw LLM output.

    Preference order:
      1. Text between explicit <<BEGIN_JSON>> / <<END_JSON>> markers.
      2. A ``{...}`` block ending at the end of the text (trailing
         whitespace tolerated).
      3. The first *balanced* ``{...}`` block, found by brace counting.

    Returns an empty string when no candidate is found.
    """
    m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Allow trailing whitespace/newlines after the closing brace.
    m2 = re.search(r"\{[\s\S]*\}\s*$", text)
    if m2:
        return m2.group(0).strip()
    # Balanced-brace scan. The previous fallback used the non-greedy regex
    # \{[\s\S]*?\}, which truncated nested objects at the first closing
    # brace and so produced unparseable fragments.
    start = text.find("{")
    if start != -1:
        depth = 0
        for i, ch in enumerate(text[start:], start):
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start:i + 1]
    return ""
215
+
216
+
217
def try_parse_and_validate(json_text: str) -> "tuple[bool, Dict[str, Any] | None, str]":
    """Parse *json_text* and validate it against METADATA_SCHEMA.

    Returns ``(ok, parsed, error)``:
      - ``(True, dict, "")`` when the text parses and passes the schema;
      - ``(False, None, msg)`` when ``json.loads`` fails;
      - ``(False, dict, msg)`` when it parses but fails schema validation
        (the partial dict is kept so callers can display/repair it).

    Note: the original annotation ``(bool, Dict[str, Any], str)`` was a
    runtime tuple literal, not a type annotation; fixed to a string tuple
    annotation to stay version-portable.
    """
    try:
        parsed = json.loads(json_text)
    except Exception as e:
        return False, None, f"json.loads error: {e}"

    try:
        json_validate(parsed, METADATA_SCHEMA)
    except ValidationError as e:
        return False, parsed, f"schema validation error: {e}"
    except Exception as e:
        # other validation errors
        return False, parsed, f"schema validation unexpected error: {e}"

    return True, parsed, ""
235
+
236
+
237
+ # -----------------------
238
+ # LLM call with retries + repair logic
239
+ # -----------------------
240
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], max_attempts: int = 3) -> Dict[str, Any]:
    """
    Robust LLM call:
    - uses system message to enforce JSON-only output between markers
    - retries up to max_attempts
    - if model returns partial/invalid JSON, asks model to repair it
    - validates the JSON against METADATA_SCHEMA
    Returns:
    - valid metadata dict OR dict with keys like _parsing_error/raw_output for UI consumption
      (``_api_error`` dict if the API call itself raised).
    """
    # System message pins the output contract: JSON only, wrapped in markers
    # so extract_json_from_text() can find it deterministically.
    system_msg = (
        "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
        "When producing output for this task you MUST return ONLY a JSON object and NOTHING ELSE. "
        "Wrap the JSON in explicit markers: <<BEGIN_JSON>> and <<END_JSON>>. "
        "Do not include any commentary, explanation, or text outside those markers."
    )

    prompt_intro = (
        f"Document title: {title}\n\n"
        f"Short document text (first ~1000 chars): {short_text}\n\n"
        "Top content chunks (short):\n"
    )

    # At most 6 chunks, each truncated to 800 chars with newlines flattened,
    # to keep the prompt bounded.
    prompt_chunks = ""
    for i, c in enumerate(top_chunks[:6]):
        chunk_text_clean = c[:800].replace("\n", " ")
        prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"

    prompt_end = (
        "Task: Produce a single JSON object with EXACT keys:\n"
        "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
        "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
        "Guidelines:\n"
        "- summary: 1-2 sentences.\n"
        "- doc_type: short enum-like string (e.g., architecture_comparison).\n"
        "- tags: up to 8 short tags like arch:docai.\n"
        "- tag_confidences: floats 0-1 for each tag.\n"
        "- taxonomy_path: hierarchical list.\n\n"
        "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>.\n"
    )

    user_prompt = prompt_intro + prompt_chunks + prompt_end

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_prompt},
    ]

    # Raw text of the most recent model reply; returned for debugging when
    # every attempt fails.
    last_raw = None

    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                # NOTE(review): max_completion_tokens is the newer-style
                # parameter name — confirm the installed openai version
                # accepts it for this model id.
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            # API-level failure is terminal: no retry, report to caller.
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}

        # extract text
        # message may be dict-like or an attribute object depending on the
        # openai client version; dict access is tried first, then attribute
        # access, then str(resp) as a last resort.
        try:
            text = resp.choices[0].message["content"].strip()
        except Exception:
            try:
                text = resp.choices[0].message.content.strip()
            except Exception:
                text = str(resp)

        last_raw = text

        # extract the JSON
        json_text = extract_json_from_text(text)
        if not json_text:
            # prepare a repair prompt and retry if attempts left
            if attempt < max_attempts:
                fix_prompt = (
                    "The previous response did not include a JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> markers, "
                    "or returned invalid JSON. Here is the raw output:\n\n"
                    f"{text}\n\n"
                    "Please return ONLY a valid JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
                    "Do not include anything else."
                )
                # History is replaced, not appended: each retry sees only the
                # system message plus the repair request.
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": fix_prompt},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found between markers or as object."}

        ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
        if ok:
            return parsed_or_partial
        else:
            # parsed_or_partial may be dict (parsed but schema-failed) or None
            if attempt < max_attempts:
                repair_prompt = (
                    "The JSON you returned is invalid or does not meet the schema. Here is the JSON you returned:\n\n"
                    f"{json_text}\n\n"
                    "Please return ONLY a corrected JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> that includes the required keys: "
                    "doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
                    "If you must guess missing fields, use reasonable defaults (empty string or empty list/map)."
                )
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": repair_prompt},
                ]
                continue
            else:
                # Out of attempts: surface everything we have for the UI.
                return {
                    "_parsing_error": True,
                    "raw_output": last_raw,
                    "parsed_partial": parsed_or_partial,
                    "parse_error": parse_err,
                }

    # Defensive fallback; the loop above always returns, so this is only
    # reachable if max_attempts < 1.
    return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
358
+
359
+
360
+ # -----------------------
361
+ # process file (save -> extract -> chunk -> call LLM)
362
+ # -----------------------
363
+ def process_file(file_obj) -> Dict[str, Any]:
364
  try:
365
  tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
366
  except Exception as e:
367
  return {"error": f"Failed to save uploaded file: {e}"}
368
 
369
+ # extract text
370
  try:
371
  if orig_name.lower().endswith(".pdf"):
372
  extracted_text = extract_text_from_pdf(tmp_path)
 
378
  if not extracted_text:
379
  return {"error": "No text found in document after extraction."}
380
 
 
381
  chunks = chunk_text(extracted_text)
382
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
383
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
384
 
385
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
386
 
387
+ metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks, max_attempts=3)
 
388
 
389
+ # If API error
390
  if metadata.get("_api_error"):
391
  return {"error": metadata.get("error")}
392
 
393
+ # If parsing/validation error, include raw_output so UI can show & repair
394
  if metadata.get("_parsing_error"):
395
+ return {
396
+ "error": "LLM output parsing failed. See raw_output.",
397
+ "raw_output": metadata.get("raw_output"),
398
+ "parsed_partial": metadata.get("parsed_partial"),
399
+ "parse_error": metadata.get("parse_error"),
400
+ }
401
+
402
+ # Ensure minimal keys and timestamp
403
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
404
  metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
405
  metadata.setdefault("title", orig_name)
 
410
  return metadata
411
 
412
 
413
+ # -----------------------
414
+ # Repair-only function (user-triggered) - repair raw_output into valid JSON
415
+ # -----------------------
416
def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
    """
    Send the raw output back to the model and ask for corrected JSON between markers.
    This function is useful if the initial parsing failed and you want a manual 'Repair' button in UI.

    Returns the validated metadata dict on success, or a dict carrying
    ``_api_error`` / ``_parsing_error`` keys on failure (same contract as
    call_gpt5_for_metadata).
    """
    system_msg = (
        "You are an automated assistant. The user previously received a response that was intended to be a JSON object "
        "but it may be malformed or contain extra text. Your job: RETURN ONLY a corrected JSON object wrapped between "
        "<<BEGIN_JSON>> and <<END_JSON>>. Do NOT include any other text."
    )

    repair_prompt = (
        "Here is the raw output that failed to parse:\n\n"
        f"{raw_output}\n\n"
        "Please return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
        "Ensure the object contains keys: doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
        "If a field is missing, use a reasonable default (empty string, empty list, or empty map)."
    )

    messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt}]

    # Most recent raw model reply, kept for error reporting.
    last_raw = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            # API-level failure is terminal: no retry.
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}

        # Dict access first, attribute access second (client-version
        # differences), str(resp) as a last resort.
        try:
            text = resp.choices[0].message["content"].strip()
        except Exception:
            try:
                text = resp.choices[0].message.content.strip()
            except Exception:
                text = str(resp)

        last_raw = text
        json_text = extract_json_from_text(text)
        if not json_text:
            if attempt < max_attempts:
                # Replace the conversation with a terse re-ask.
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": "Your previous reply did not include a JSON block. Please return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found after repair attempts"}

        ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
        if ok:
            return parsed_or_partial
        else:
            if attempt < max_attempts:
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err}

    # Only reachable when max_attempts < 1.
    return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
482
+
483
+ # -----------------------
484
  # Gradio UI
485
+ # -----------------------
486
  with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
487
  gr.Markdown("## 📂 Upload a PDF or Image — the app will classify, tag, and propose a taxonomy using GPT-5")
488
  with gr.Row():
 
491
  run_button = gr.Button("Process document")
492
  status = gr.Textbox(label="Status", value="", interactive=False)
493
  download_button = gr.File(label="Download metadata JSON", visible=False)
494
+ repair_button = gr.Button("Repair last raw output", visible=True)
495
  with gr.Column(scale=1):
496
  output_json = gr.JSON(label="Document metadata (JSON)")
497
+ raw_output_box = gr.Textbox(label="Raw LLM output / parse errors", interactive=False)
498
+
499
+ # State holders
500
+ last_raw_state = gr.State(value=None) # stores raw_output when parsing fails
501
+ last_metadata_file = gr.State(value=None) # stores path to last generated metadata file (for download)
502
 
503
    def on_process(file_obj, last_raw_state):
        """Handle the 'Process document' click.

        Returns a 4-tuple matching the click() outputs wiring:
        (JSON panel value, status text, download file path or None,
        raw LLM output for raw_output_box or None).

        NOTE(review): ``last_raw_state`` is accepted (it is wired as an
        input) but never read here, and the local ``status`` string below is
        never returned — both look vestigial; confirm before removing.
        """
        status = "Processing..."
        # initial empty responses
        empty_val = {}
        try:
            result = process_file(file_obj)
        except Exception as e:
            # Unexpected failure: clear the JSON panel and surface the error.
            return empty_val, f"Failed: {e}", None, None

        if result.get("error"):
            # if LLM returned parsing error, store raw_output in state and show it
            raw = result.get("raw_output", "")
            # prepare displayed payload that includes the error note
            display_obj = {"error": result.get("error")}
            if result.get("parsed_partial") is not None:
                display_obj["parsed_partial"] = result.get("parsed_partial")
            # Save raw_output to state for potential repair
            return display_obj, f"Error: {result.get('error')}", None, raw

        # success: return JSON and create downloadable temp file
        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
        with open(tmpf.name, "w", encoding="utf8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return result, "Done", tmpf.name, None
528
+
529
    def on_repair(raw_output):
        """Handle the 'Repair last raw output' click.

        Takes the text currently shown in raw_output_box and asks the model
        to turn it into valid metadata JSON. Returns a 3-tuple matching the
        click() outputs wiring: (JSON panel value, status text, download
        file path or None).
        """
        if not raw_output:
            return {}, "No raw_output available to repair.", None
        try:
            repaired = repair_raw_output(raw_output, max_attempts=2)
        except Exception as e:
            return {}, f"Repair failed: {e}", None

        if repaired.get("_api_error"):
            return {}, f"Repair API error: {repaired.get('error')}", None

        if repaired.get("_parsing_error"):
            # still failed; show raw_output and parsed_partial
            display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial")}
            return display, "Repair failed: parsing error", None

        # success -> create download file
        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
        with open(tmpf.name, "w", encoding="utf8") as f:
            json.dump(repaired, f, indent=2, ensure_ascii=False)

        return repaired, "Repair succeeded", tmpf.name
551
 
552
+ # Wire up buttons
553
+ run_button.click(on_process, inputs=[uploader, last_raw_state], outputs=[output_json, status, download_button, raw_output_box])
554
+ repair_button.click(on_repair, inputs=[raw_output_box], outputs=[output_json, status, download_button])
555
 
556
+ # launch
557
  if __name__ == "__main__":
558
  demo.launch()