Toulik committed on
Commit
4975bf7
·
verified ·
1 Parent(s): abc5d71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +376 -140
app.py CHANGED
@@ -1,4 +1,24 @@
1
  # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import json
4
  import tempfile
@@ -10,32 +30,61 @@ import gradio as gr
10
  from PIL import Image
11
  import fitz # PyMuPDF
12
  import pytesseract
13
- # pdf2image is optional here, we used PyMuPDF for PDF -> image rendering fallback
14
- # from pdf2image import convert_from_path
15
 
16
- # OpenAI new client
17
  from openai import OpenAI
18
 
19
  # -----------------------
20
- # Configuration / Client
21
  # -----------------------
22
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
23
  if not OPENAI_API_KEY:
24
- raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to Secrets in HF Space or set env var.")
25
 
26
- # Create the new OpenAI client (new API surface for openai>=1.0.0)
27
  client = OpenAI(api_key=OPENAI_API_KEY)
28
 
29
- LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change to your available model id if needed
30
- EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small") # optional
31
 
32
- # ----------------------
33
- # Text extraction utils
34
- # ----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def extract_text_from_pdf(path: str) -> str:
36
- """
37
- Extract text using PyMuPDF. If a page has no extractable text, render to image and OCR with pytesseract.
38
- """
39
  try:
40
  doc = fitz.open(path)
41
  except Exception as e:
@@ -48,7 +97,7 @@ def extract_text_from_pdf(path: str) -> str:
48
  if txt:
49
  texts.append(txt)
50
  else:
51
- # fallback to render page and OCR
52
  pix = page.get_pixmap(dpi=200)
53
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
54
  pix.save(tmp.name)
@@ -62,9 +111,6 @@ def extract_text_from_image(path: str) -> str:
62
  return pytesseract.image_to_string(img).strip()
63
 
64
 
65
- # ----------------------
66
- # Chunker
67
- # ----------------------
68
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
69
  paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
70
  chunks: List[str] = []
@@ -80,109 +126,22 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
80
  chunks.append(current)
81
  return chunks
82
 
83
-
84
- # ----------------------
85
- # OpenAI LLM & embeddings helpers (new client surface)
86
- # ----------------------
87
- def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -> Dict[str, Any]:
88
- """
89
- Prompt GPT-5 to return a single JSON object matching the schema the user specified.
90
- We ask the model to return JSON only. We do a best-effort parse and return structured dict.
91
- """
92
- prompt_intro = (
93
- "You are an automated document taxonomy and tagging assistant for enterprise catalogs.\n\n"
94
- f"Document title: {title}\n\n"
95
- f"Short document text (first ~1000 chars): {short_text}\n\n"
96
- "Top content chunks (short):\n"
97
- )
98
-
99
- prompt_chunks = ""
100
- for i, c in enumerate(top_chunks[:6]):
101
- chunk_text_clean = c[:800].replace("\n", " ")
102
- prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
103
-
104
- prompt_end = (
105
- "Task: Produce a single JSON object (machine parseable) with EXACT keys:\n"
106
- "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
107
- "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
108
- "Guidelines:\n"
109
- "- summary: 1-2 sentences summarizing the doc.\n"
110
- "- doc_type: short enum-like string (e.g., architecture_comparison, whitepaper, design_doc)\n"
111
- "- tags: up to 8 short tags like arch:docai, topic:ocr-parsing\n"
112
- "- tag_confidences: map with floats 0-1 for each tag\n"
113
- "- taxonomy_path: hierarchical list, e.g. [\"Technology\",\"Document Processing\",\"OCR & Parsing\"]\n"
114
- "- extracted_entities: map with keys like platforms, tools (each is an array)\n"
115
- "- raw_url: if not available, return an empty string\n"
116
- "- ingest_timestamp: ISO8601 with timezone (e.g., 2025-09-19T09:13:00+05:30)\n\n"
117
- "OUTPUT: ONLY THE JSON OBJECT. DO NOT PROVIDE ANY ADDITIONAL TEXT.\n"
118
- )
119
-
120
- prompt = prompt_intro + prompt_chunks + prompt_end
121
-
122
- # Call using new client
123
- try:
124
- resp = client.chat.completions.create(
125
- model=LLM_MODEL,
126
- messages=[{"role": "user", "content": prompt}],
127
- max_completion_tokens=1500,
128
- seed=42, # optional: for reproducibility
129
- )
130
-
131
- except Exception as e:
132
- return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}
133
-
134
- # Extract text robustly
135
- try:
136
- text = resp.choices[0].message["content"].strip()
137
- except Exception:
138
- # fallback attribute access if response uses attribute objects
139
- try:
140
- text = resp.choices[0].message.content.strip()
141
- except Exception:
142
- text = str(resp)
143
-
144
- # Try to extract JSON block
145
- m = re.search(r"\{[\s\S]*\}$", text)
146
- json_text = m.group(0) if m else text
147
-
148
- try:
149
- data = json.loads(json_text)
150
- except Exception:
151
- data = {"_parsing_error": True, "raw_output": text}
152
- return data
153
-
154
-
155
- def get_embeddings_for_chunks(chunks: List[str], model: str = EMBEDDING_MODEL) -> List[List[float]]:
156
- try:
157
- resp = client.embeddings.create(model=model, input=chunks)
158
- except Exception as e:
159
- raise RuntimeError(f"Embeddings API call failed: {e}")
160
-
161
- # resp.data is an array of objects containing .embedding
162
- try:
163
- return [item.embedding for item in resp.data]
164
- except Exception:
165
- # fallback to dict-like access
166
- return [item["embedding"] for item in resp.data]
167
-
168
-
169
- # ----------------------
170
- # Robust uploader helper + processing
171
- # ----------------------
172
  def save_uploaded_to_tmp(file_obj):
173
  """
174
- Accepts multiple upload types commonly returned by gradio:
175
- - file-like object with .read()
176
- - dict-like {"name": "...", "data": b'...'}
177
- - path string (existing file path)
178
- - objects with a .name attribute pointing to a saved path (NamedString)
179
- Returns (tmp_path, original_name)
180
  """
181
- # Case 1: file-like object with .read()
182
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
183
  try:
184
  content = file_obj.read()
185
- # sometimes content may be str
186
  if isinstance(content, str):
187
  content = content.encode("utf-8")
188
  name = getattr(file_obj, "name", "uploaded_file")
@@ -193,7 +152,7 @@ def save_uploaded_to_tmp(file_obj):
193
  except Exception:
194
  pass
195
 
196
- # Case 2: dict-like returned by some gradio versions
197
  if isinstance(file_obj, dict):
198
  if "data" in file_obj and "name" in file_obj:
199
  data = file_obj["data"]
@@ -205,7 +164,7 @@ def save_uploaded_to_tmp(file_obj):
205
  tmp.write(data)
206
  return tmp.name, os.path.basename(name)
207
 
208
- # Case 3: file_obj is a path string
209
  if isinstance(file_obj, str):
210
  if os.path.exists(file_obj):
211
  return file_obj, os.path.basename(file_obj)
@@ -219,7 +178,7 @@ def save_uploaded_to_tmp(file_obj):
219
  except Exception:
220
  pass
221
 
222
- # Case 4: object has .name attribute referencing a real path (NamedString)
223
  name = getattr(file_obj, "name", None)
224
  if name and isinstance(name, str):
225
  try:
@@ -232,20 +191,182 @@ def save_uploaded_to_tmp(file_obj):
232
  except Exception:
233
  pass
234
 
235
- raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. Value repr: {repr(file_obj)[:400]}")
236
 
237
 
238
- def process_file(file_obj) -> Dict[str, Any]:
 
 
 
 
 
 
239
  """
240
- Orchestrates saving uploaded file, extracting text, chunking, calling LLM and post-processing.
241
- Returns: metadata dict or {"error": "..."} on failure.
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  try:
244
  tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
245
  except Exception as e:
246
  return {"error": f"Failed to save uploaded file: {e}"}
247
 
248
- # Extract text
249
  try:
250
  if orig_name.lower().endswith(".pdf"):
251
  extracted_text = extract_text_from_pdf(tmp_path)
@@ -257,23 +378,28 @@ def process_file(file_obj) -> Dict[str, Any]:
257
  if not extracted_text:
258
  return {"error": "No text found in document after extraction."}
259
 
260
- # Chunk and pick top chunks
261
  chunks = chunk_text(extracted_text)
262
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
263
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
264
 
265
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
266
 
267
- # Call LLM to get JSON metadata
268
- metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks)
269
 
 
270
  if metadata.get("_api_error"):
271
  return {"error": metadata.get("error")}
272
 
 
273
  if metadata.get("_parsing_error"):
274
- return {"error": "LLM output parsing failed. See raw_output.", "raw_output": metadata.get("raw_output")}
275
-
276
- # Ensure required keys and add ingestion timestamp if missing
 
 
 
 
 
277
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
278
  metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
279
  metadata.setdefault("title", orig_name)
@@ -284,9 +410,79 @@ def process_file(file_obj) -> Dict[str, Any]:
284
  return metadata
285
 
286
 
287
- # ----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  # Gradio UI
289
- # ----------------------
290
  with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
291
  gr.Markdown("## 📂 Upload a PDF or Image — the app will classify, tag, and propose a taxonomy using GPT-5")
292
  with gr.Row():
@@ -295,28 +491,68 @@ with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
295
  run_button = gr.Button("Process document")
296
  status = gr.Textbox(label="Status", value="", interactive=False)
297
  download_button = gr.File(label="Download metadata JSON", visible=False)
 
298
  with gr.Column(scale=1):
299
  output_json = gr.JSON(label="Document metadata (JSON)")
 
 
 
 
 
300
 
301
- def on_process(file_obj):
302
- status.value = "Processing..."
 
 
303
  try:
304
  result = process_file(file_obj)
305
  except Exception as e:
306
- return gr.update(value={}), gr.update(value=f"Failed: {e}"), None
307
 
308
  if result.get("error"):
309
- return gr.update(value={"error": result.get("error"), "raw_output": result.get("raw_output", "")}), gr.update(value=f"Error: {result.get('error')}"), None
310
-
311
- # create a temp json file for download
 
 
 
 
 
 
 
312
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
313
  with open(tmpf.name, "w", encoding="utf8") as f:
314
  json.dump(result, f, indent=2, ensure_ascii=False)
315
 
316
- return gr.update(value=result), gr.update(value="Done"), tmpf.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
- run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button])
 
 
319
 
320
- # Launch
321
  if __name__ == "__main__":
322
  demo.launch()
 
1
  # app.py
2
+ """
3
+ Gradio app: upload PDF / Image -> extract text (PyMuPDF + Tesseract fallback) ->
4
+ call GPT-5 (OpenAI new client) to produce machine-parseable metadata JSON (between markers) ->
5
+ validate JSON (jsonschema) -> show JSON and allow download.
6
+
7
+ Requirements (add to requirements.txt for HF Space or local venv):
8
+ gradio>=3.0
9
+ PyMuPDF
10
+ pytesseract
11
+ Pillow
12
+ openai>=1.0.0
13
+ jsonschema
14
+
15
+ System packages required (HF Spaces apt-packages):
16
+ tesseract-ocr
17
+ poppler-utils
18
+
19
+ Put OPENAI_API_KEY into your environment/Space Secrets.
20
+ """
21
+
22
  import os
23
  import json
24
  import tempfile
 
30
  from PIL import Image
31
  import fitz # PyMuPDF
32
  import pytesseract
33
+ from jsonschema import validate as json_validate, ValidationError
 
34
 
35
+ # new OpenAI client surface
36
  from openai import OpenAI
37
 
38
  # -----------------------
39
+ # Config / client
40
  # -----------------------
41
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
42
  if not OPENAI_API_KEY:
43
+ raise RuntimeError("OPENAI_API_KEY not found in environment. Add to HF Space Secrets or env var.")
44
 
 
45
  client = OpenAI(api_key=OPENAI_API_KEY)
46
 
47
+ LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change if you have a different model id
48
+ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
49
 
50
+ # -----------------------
51
+ # JSON schema for validation
52
+ # -----------------------
53
# JSON Schema (jsonschema draft style) used to validate the metadata object
# the LLM returns. All eleven keys are mandatory; unknown extra keys are
# tolerated ("additionalProperties": True) so the model may enrich the
# object without failing validation.
METADATA_SCHEMA = {
    "type": "object",
    "required": [
        "doc_id",
        "title",
        "summary",
        "doc_type",
        "source",
        "tags",
        "tag_confidences",
        "taxonomy_path",
        "extracted_entities",
        "raw_url",
        "ingest_timestamp",
    ],
    "properties": {
        # Scalar descriptive fields.
        "doc_id": {"type": "string"},
        "title": {"type": "string"},
        "summary": {"type": "string"},
        "doc_type": {"type": "string"},
        "source": {"type": "string"},
        # Tagging: a flat list of tag strings plus a tag -> confidence map.
        # NOTE(review): tag_confidences values are not constrained to numbers
        # here — confirm whether {"type": "number"} items were intended.
        "tags": {"type": "array", "items": {"type": "string"}},
        "tag_confidences": {"type": "object"},
        # Hierarchical classification path, root first.
        "taxonomy_path": {"type": "array", "items": {"type": "string"}},
        "extracted_entities": {"type": "object"},
        "raw_url": {"type": "string"},
        "ingest_timestamp": {"type": "string"},
    },
    "additionalProperties": True,
}
83
+
84
+ # -----------------------
85
+ # Extraction helpers
86
+ # -----------------------
87
  def extract_text_from_pdf(path: str) -> str:
 
 
 
88
  try:
89
  doc = fitz.open(path)
90
  except Exception as e:
 
97
  if txt:
98
  texts.append(txt)
99
  else:
100
+ # render and OCR
101
  pix = page.get_pixmap(dpi=200)
102
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
103
  pix.save(tmp.name)
 
111
  return pytesseract.image_to_string(img).strip()
112
 
113
 
 
 
 
114
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
115
  paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
116
  chunks: List[str] = []
 
126
  chunks.append(current)
127
  return chunks
128
 
129
+ # -----------------------
130
+ # Utilities for robust upload handling
131
+ # -----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def save_uploaded_to_tmp(file_obj):
133
  """
134
+ Accepts common Gradio upload types:
135
+ - file-like (has .read())
136
+ - dict-like {"name": ..., "data": b'...'}
137
+ - path string
138
+ - objects with .name attribute pointing to a path (NamedString)
139
+ Returns (tmp_path, original_filename)
140
  """
141
+ # file-like
142
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
143
  try:
144
  content = file_obj.read()
 
145
  if isinstance(content, str):
146
  content = content.encode("utf-8")
147
  name = getattr(file_obj, "name", "uploaded_file")
 
152
  except Exception:
153
  pass
154
 
155
+ # dict-like
156
  if isinstance(file_obj, dict):
157
  if "data" in file_obj and "name" in file_obj:
158
  data = file_obj["data"]
 
164
  tmp.write(data)
165
  return tmp.name, os.path.basename(name)
166
 
167
+ # path string
168
  if isinstance(file_obj, str):
169
  if os.path.exists(file_obj):
170
  return file_obj, os.path.basename(file_obj)
 
178
  except Exception:
179
  pass
180
 
181
+ # object with .name attribute referencing existing path
182
  name = getattr(file_obj, "name", None)
183
  if name and isinstance(name, str):
184
  try:
 
191
  except Exception:
192
  pass
193
 
194
+ raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
195
 
196
 
197
+ # -----------------------
198
+ # JSON extraction & validation helpers
199
+ # -----------------------
200
def extract_json_from_text(text: str) -> str:
    """Extract a JSON object string from raw LLM output.

    Preference order:
      1. Text between explicit <<BEGIN_JSON>> / <<END_JSON>> markers.
      2. A ``{...}`` block ending at the end of the text (trailing
         whitespace tolerated).
      3. The first *balanced* ``{...}`` block, found by brace counting.

    Returns an empty string when no candidate is found.
    """
    m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Allow trailing whitespace/newlines after the closing brace.
    m2 = re.search(r"\{[\s\S]*\}\s*$", text)
    if m2:
        return m2.group(0).strip()
    # Balanced-brace scan. The previous fallback used the non-greedy regex
    # \{[\s\S]*?\}, which truncated nested objects at the first closing
    # brace and so produced unparseable fragments.
    start = text.find("{")
    if start != -1:
        depth = 0
        for i, ch in enumerate(text[start:], start):
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[start:i + 1]
    return ""
215
+
216
+
217
def try_parse_and_validate(json_text: str) -> "tuple[bool, Dict[str, Any] | None, str]":
    """Parse *json_text* and validate it against METADATA_SCHEMA.

    Returns ``(ok, parsed, error)``:
      - ``(True, dict, "")`` when the text parses and passes the schema;
      - ``(False, None, msg)`` when ``json.loads`` fails;
      - ``(False, dict, msg)`` when it parses but fails schema validation
        (the partial dict is kept so callers can display/repair it).

    Note: the original annotation ``(bool, Dict[str, Any], str)`` was a
    runtime tuple literal, not a type annotation; fixed to a string tuple
    annotation to stay version-portable.
    """
    try:
        parsed = json.loads(json_text)
    except Exception as e:
        return False, None, f"json.loads error: {e}"

    try:
        json_validate(parsed, METADATA_SCHEMA)
    except ValidationError as e:
        return False, parsed, f"schema validation error: {e}"
    except Exception as e:
        # other validation errors
        return False, parsed, f"schema validation unexpected error: {e}"

    return True, parsed, ""
235
+
236
+
237
+ # -----------------------
238
+ # LLM call with retries + repair logic
239
+ # -----------------------
240
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], max_attempts: int = 3) -> Dict[str, Any]:
    """
    Robust LLM call:
    - uses system message to enforce JSON-only output between markers
    - retries up to max_attempts
    - if model returns partial/invalid JSON, asks model to repair it
    - validates the JSON against METADATA_SCHEMA
    Returns:
    - valid metadata dict OR dict with keys like _parsing_error/raw_output for UI consumption
      (``_api_error`` dict if the API call itself raised).
    """
    # System message pins the output contract: JSON only, wrapped in markers
    # so extract_json_from_text() can find it deterministically.
    system_msg = (
        "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
        "When producing output for this task you MUST return ONLY a JSON object and NOTHING ELSE. "
        "Wrap the JSON in explicit markers: <<BEGIN_JSON>> and <<END_JSON>>. "
        "Do not include any commentary, explanation, or text outside those markers."
    )

    prompt_intro = (
        f"Document title: {title}\n\n"
        f"Short document text (first ~1000 chars): {short_text}\n\n"
        "Top content chunks (short):\n"
    )

    # At most 6 chunks, each truncated to 800 chars with newlines flattened,
    # to keep the prompt bounded.
    prompt_chunks = ""
    for i, c in enumerate(top_chunks[:6]):
        chunk_text_clean = c[:800].replace("\n", " ")
        prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"

    prompt_end = (
        "Task: Produce a single JSON object with EXACT keys:\n"
        "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
        "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
        "Guidelines:\n"
        "- summary: 1-2 sentences.\n"
        "- doc_type: short enum-like string (e.g., architecture_comparison).\n"
        "- tags: up to 8 short tags like arch:docai.\n"
        "- tag_confidences: floats 0-1 for each tag.\n"
        "- taxonomy_path: hierarchical list.\n\n"
        "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>.\n"
    )

    user_prompt = prompt_intro + prompt_chunks + prompt_end

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_prompt},
    ]

    # Raw text of the most recent model reply; returned for debugging when
    # every attempt fails.
    last_raw = None

    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                # NOTE(review): max_completion_tokens is the newer-style
                # parameter name — confirm the installed openai version
                # accepts it for this model id.
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            # API-level failure is terminal: no retry, report to caller.
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}

        # extract text
        # message may be dict-like or an attribute object depending on the
        # openai client version; dict access is tried first, then attribute
        # access, then str(resp) as a last resort.
        try:
            text = resp.choices[0].message["content"].strip()
        except Exception:
            try:
                text = resp.choices[0].message.content.strip()
            except Exception:
                text = str(resp)

        last_raw = text

        # extract the JSON
        json_text = extract_json_from_text(text)
        if not json_text:
            # prepare a repair prompt and retry if attempts left
            if attempt < max_attempts:
                fix_prompt = (
                    "The previous response did not include a JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> markers, "
                    "or returned invalid JSON. Here is the raw output:\n\n"
                    f"{text}\n\n"
                    "Please return ONLY a valid JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
                    "Do not include anything else."
                )
                # History is replaced, not appended: each retry sees only the
                # system message plus the repair request.
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": fix_prompt},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found between markers or as object."}

        ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
        if ok:
            return parsed_or_partial
        else:
            # parsed_or_partial may be dict (parsed but schema-failed) or None
            if attempt < max_attempts:
                repair_prompt = (
                    "The JSON you returned is invalid or does not meet the schema. Here is the JSON you returned:\n\n"
                    f"{json_text}\n\n"
                    "Please return ONLY a corrected JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> that includes the required keys: "
                    "doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
                    "If you must guess missing fields, use reasonable defaults (empty string or empty list/map)."
                )
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": repair_prompt},
                ]
                continue
            else:
                # Out of attempts: surface everything we have for the UI.
                return {
                    "_parsing_error": True,
                    "raw_output": last_raw,
                    "parsed_partial": parsed_or_partial,
                    "parse_error": parse_err,
                }

    # Defensive fallback; the loop above always returns, so this is only
    # reachable if max_attempts < 1.
    return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
358
+
359
+
360
+ # -----------------------
361
+ # process file (save -> extract -> chunk -> call LLM)
362
+ # -----------------------
363
+ def process_file(file_obj) -> Dict[str, Any]:
364
  try:
365
  tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
366
  except Exception as e:
367
  return {"error": f"Failed to save uploaded file: {e}"}
368
 
369
+ # extract text
370
  try:
371
  if orig_name.lower().endswith(".pdf"):
372
  extracted_text = extract_text_from_pdf(tmp_path)
 
378
  if not extracted_text:
379
  return {"error": "No text found in document after extraction."}
380
 
 
381
  chunks = chunk_text(extracted_text)
382
  sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
383
  top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
384
 
385
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
386
 
387
+ metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks, max_attempts=3)
 
388
 
389
+ # If API error
390
  if metadata.get("_api_error"):
391
  return {"error": metadata.get("error")}
392
 
393
+ # If parsing/validation error, include raw_output so UI can show & repair
394
  if metadata.get("_parsing_error"):
395
+ return {
396
+ "error": "LLM output parsing failed. See raw_output.",
397
+ "raw_output": metadata.get("raw_output"),
398
+ "parsed_partial": metadata.get("parsed_partial"),
399
+ "parse_error": metadata.get("parse_error"),
400
+ }
401
+
402
+ # Ensure minimal keys and timestamp
403
  now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
404
  metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
405
  metadata.setdefault("title", orig_name)
 
410
  return metadata
411
 
412
 
413
+ # -----------------------
414
+ # Repair-only function (user-triggered) - repair raw_output into valid JSON
415
+ # -----------------------
416
def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
    """
    Send the raw output back to the model and ask for corrected JSON between markers.
    This function is useful if the initial parsing failed and you want a manual 'Repair' button in UI.

    Returns the validated metadata dict on success, or a dict carrying
    ``_api_error`` / ``_parsing_error`` keys on failure (same contract as
    call_gpt5_for_metadata).
    """
    system_msg = (
        "You are an automated assistant. The user previously received a response that was intended to be a JSON object "
        "but it may be malformed or contain extra text. Your job: RETURN ONLY a corrected JSON object wrapped between "
        "<<BEGIN_JSON>> and <<END_JSON>>. Do NOT include any other text."
    )

    repair_prompt = (
        "Here is the raw output that failed to parse:\n\n"
        f"{raw_output}\n\n"
        "Please return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
        "Ensure the object contains keys: doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
        "If a field is missing, use a reasonable default (empty string, empty list, or empty map)."
    )

    messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt}]

    # Most recent raw model reply, kept for error reporting.
    last_raw = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            # API-level failure is terminal: no retry.
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}

        # Dict access first, attribute access second (client-version
        # differences), str(resp) as a last resort.
        try:
            text = resp.choices[0].message["content"].strip()
        except Exception:
            try:
                text = resp.choices[0].message.content.strip()
            except Exception:
                text = str(resp)

        last_raw = text
        json_text = extract_json_from_text(text)
        if not json_text:
            if attempt < max_attempts:
                # Replace the conversation with a terse re-ask.
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": "Your previous reply did not include a JSON block. Please return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found after repair attempts"}

        ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
        if ok:
            return parsed_or_partial
        else:
            if attempt < max_attempts:
                messages = [
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
                ]
                continue
            else:
                return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err}

    # Only reachable when max_attempts < 1.
    return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
482
+
483
+ # -----------------------
484
  # Gradio UI
485
+ # -----------------------
486
  with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
487
  gr.Markdown("## 📂 Upload a PDF or Image — the app will classify, tag, and propose a taxonomy using GPT-5")
488
  with gr.Row():
 
491
  run_button = gr.Button("Process document")
492
  status = gr.Textbox(label="Status", value="", interactive=False)
493
  download_button = gr.File(label="Download metadata JSON", visible=False)
494
+ repair_button = gr.Button("Repair last raw output", visible=True)
495
  with gr.Column(scale=1):
496
  output_json = gr.JSON(label="Document metadata (JSON)")
497
+ raw_output_box = gr.Textbox(label="Raw LLM output / parse errors", interactive=False)
498
+
499
+ # State holders
500
+ last_raw_state = gr.State(value=None) # stores raw_output when parsing fails
501
+ last_metadata_file = gr.State(value=None) # stores path to last generated metadata file (for download)
502
 
503
    def on_process(file_obj, last_raw_state):
        """Handle the 'Process document' click.

        Returns a 4-tuple matching the click() outputs wiring:
        (JSON panel value, status text, download file path or None,
        raw LLM output for raw_output_box or None).

        NOTE(review): ``last_raw_state`` is accepted (it is wired as an
        input) but never read here, and the local ``status`` string below is
        never returned — both look vestigial; confirm before removing.
        """
        status = "Processing..."
        # initial empty responses
        empty_val = {}
        try:
            result = process_file(file_obj)
        except Exception as e:
            # Unexpected failure: clear the JSON panel and surface the error.
            return empty_val, f"Failed: {e}", None, None

        if result.get("error"):
            # if LLM returned parsing error, store raw_output in state and show it
            raw = result.get("raw_output", "")
            # prepare displayed payload that includes the error note
            display_obj = {"error": result.get("error")}
            if result.get("parsed_partial") is not None:
                display_obj["parsed_partial"] = result.get("parsed_partial")
            # Save raw_output to state for potential repair
            return display_obj, f"Error: {result.get('error')}", None, raw

        # success: return JSON and create downloadable temp file
        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
        with open(tmpf.name, "w", encoding="utf8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return result, "Done", tmpf.name, None
528
+
529
    def on_repair(raw_output):
        """Handle the 'Repair last raw output' click.

        Takes the text currently shown in raw_output_box and asks the model
        to turn it into valid metadata JSON. Returns a 3-tuple matching the
        click() outputs wiring: (JSON panel value, status text, download
        file path or None).
        """
        if not raw_output:
            return {}, "No raw_output available to repair.", None
        try:
            repaired = repair_raw_output(raw_output, max_attempts=2)
        except Exception as e:
            return {}, f"Repair failed: {e}", None

        if repaired.get("_api_error"):
            return {}, f"Repair API error: {repaired.get('error')}", None

        if repaired.get("_parsing_error"):
            # still failed; show raw_output and parsed_partial
            display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial")}
            return display, "Repair failed: parsing error", None

        # success -> create download file
        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
        with open(tmpf.name, "w", encoding="utf8") as f:
            json.dump(repaired, f, indent=2, ensure_ascii=False)

        return repaired, "Repair succeeded", tmpf.name
551
 
552
+ # Wire up buttons
553
+ run_button.click(on_process, inputs=[uploader, last_raw_state], outputs=[output_json, status, download_button, raw_output_box])
554
+ repair_button.click(on_repair, inputs=[raw_output_box], outputs=[output_json, status, download_button])
555
 
556
+ # launch
557
  if __name__ == "__main__":
558
  demo.launch()