rawanessam committed on
Commit
dc65367
·
verified ·
1 Parent(s): 0bd12fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -27
app.py CHANGED
@@ -3,10 +3,26 @@ import os
3
  import json
4
  import requests
5
  from io import BytesIO
 
6
  import pandas as pd
 
7
  import fitz # PyMuPDF
 
8
  from urllib.parse import urlparse, unquote
 
 
 
 
 
 
9
  import re
 
 
 
 
 
 
 
10
  import logging
11
 
12
  # Set up logging to see everything
@@ -95,9 +111,9 @@ def openPDF(pdf_path):
95
 
96
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
97
  """Ask an LLM (OpenRouter) to identify headers in the document.
98
- Returns a list of dicts: {text, page, suggested_level, confidence, body}.
99
  The function sends plain page-line strings to the LLM (including page numbers)
100
- and asks for a JSON array containing headers with suggested levels and body for the last header.
101
  """
102
  logger.info("=" * 80)
103
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
@@ -141,10 +157,17 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
141
  y1 = spans[0]['bbox'][3]
142
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
143
  # continue
144
- text = " ".join(s.get('text','') for s in spans).strip()
145
- if text:
 
 
146
  # prefix with page for easier mapping back
147
- lines_for_prompt.append(f"PAGE {pno+1}: {text}")
 
 
 
 
 
148
  lines_on_page += 1
149
 
150
  if lines_on_page > 0:
@@ -165,6 +188,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
165
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
166
 
167
  logger.debug(f"Full prompt length: {len(prompt)} characters")
 
168
  print("=" * 80)
169
  print("FULL LLM PROMPT:")
170
  print(prompt)
@@ -179,11 +203,13 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
179
  logger.error(f"Could not save prompt to file: {e}")
180
 
181
  if not api_key:
 
182
  logger.error("No API key provided")
183
  return []
184
 
185
  url = "https://openrouter.ai/api/v1/chat/completions"
186
 
 
187
  headers = {
188
  "Authorization": f"Bearer {api_key}",
189
  "Content-Type": "application/json",
@@ -191,9 +217,11 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
191
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
192
  }
193
 
 
194
  logger.info(f"Making request to OpenRouter with model: {model}")
195
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
196
 
 
197
  body = {
198
  "model": model,
199
  "messages": [
@@ -206,9 +234,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
206
  ]
207
  }
208
 
 
209
  try:
 
210
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
211
 
 
212
  resp = requests.post(
213
  url=url,
214
  headers=headers,
@@ -219,6 +250,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
219
  resp.raise_for_status()
220
 
221
  resp_text = resp.text
 
222
  print("=" * 80)
223
  print("FULL LLM RESPONSE:")
224
  print(resp_text)
@@ -226,6 +258,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
226
 
227
  logger.info(f"LLM raw response length: {len(resp_text)}")
228
 
 
229
  try:
230
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
231
  fh.write(resp_text)
@@ -284,12 +317,14 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
284
 
285
  if not text_reply:
286
  logger.error("Could not extract text reply from response")
 
287
  print("=" * 80)
288
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
289
  print(json.dumps(rj, indent=2))
290
  print("=" * 80)
291
  return []
292
 
 
293
  print("=" * 80)
294
  print("EXTRACTED TEXT REPLY:")
295
  print(text_reply)
@@ -312,7 +347,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
312
  except json.JSONDecodeError as e:
313
  logger.error(f"Failed to parse JSON: {e}")
314
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
 
315
  try:
 
316
  import re
317
  json_pattern = r'\[\s*\{.*?\}\s*\]'
318
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
@@ -338,7 +375,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
338
 
339
  # Log parsed results
340
  logger.info(f"Parsed {len(parsed)} header items:")
341
- for i, obj in enumerate(parsed[:10]):
342
  logger.info(f" Item {i}: {obj}")
343
 
344
  # Normalize parsed entries and return
@@ -348,24 +385,10 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
348
  page = int(obj.get('page')) if obj.get('page') else None
349
  level = obj.get('suggested_level')
350
  conf = float(obj.get('confidence') or 0)
351
- body = obj.get('body', '') # Get body content, default to empty string
352
-
353
  if t and page is not None:
354
- out.append({
355
- 'text': t,
356
- 'page': page-1,
357
- 'suggested_level': level,
358
- 'confidence': conf,
359
- 'body': body # Add body to output
360
- })
361
-
362
- logger.info(f"Returning {len(out)} valid header entries with body content for last header")
363
-
364
- # Log which entries have body content
365
- for i, item in enumerate(out):
366
- if item.get('body'):
367
- logger.info(f"Entry {i} has body content (length: {len(item['body'])})")
368
 
 
369
  return out
370
 
371
 
@@ -390,17 +413,13 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
390
  logger.info("DataFrame head:")
391
  logger.info(df.head().to_string())
392
 
393
- # Check which rows have body content
394
- bodies = df['body'].tolist()
395
- non_empty_bodies = [b for b in bodies if b and str(b).strip()]
396
- logger.info(f"Found {len(non_empty_bodies)} entries with body content")
397
-
398
  # Save Excel to a file on disk
399
  output_path = "output.xlsx"
400
  try:
401
  df.to_excel(output_path, index=False, engine='openpyxl')
402
  logger.info(f"Excel file saved successfully to: {output_path}")
403
 
 
404
  if os.path.exists(output_path):
405
  file_size = os.path.getsize(output_path)
406
  logger.info(f"Output file exists, size: {file_size} bytes")
@@ -421,6 +440,7 @@ iface = gr.Interface(
421
  gr.Textbox(label="LLM Prompt")
422
  ],
423
  outputs = gr.File(file_count="single", label="Download Excel")
 
424
  )
425
 
426
  if __name__ == "__main__":
 
3
  import json
4
  import requests
5
  from io import BytesIO
6
+ import gradio as gr
7
  import pandas as pd
8
+ from io import BytesIO
9
  import fitz # PyMuPDF
10
+
11
  from urllib.parse import urlparse, unquote
12
+ import os
13
+ from io import BytesIO
14
+ import re
15
+ import requests
16
+ import pandas as pd
17
+ import fitz # PyMuPDF
18
  import re
19
+ import urllib.parse
20
+ import difflib
21
+ from fuzzywuzzy import fuzz
22
+ import copy
23
+ # import tsadropboxretrieval
24
+
25
+ import urllib.parse
26
  import logging
27
 
28
  # Set up logging to see everything
 
111
 
112
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
113
  """Ask an LLM (OpenRouter) to identify headers in the document.
114
+ Returns a list of dicts: {text, page, suggested_level, confidence}.
115
  The function sends plain page-line strings to the LLM (including page numbers)
116
+ and asks for a JSON array containing only header lines with suggested levels.
117
  """
118
  logger.info("=" * 80)
119
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
 
157
  y1 = spans[0]['bbox'][3]
158
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
159
  # continue
160
+ for s in spans:
161
+ # text,font,size,flags,color
162
+ ArrayofTextWithFormat={s.get('text')}
163
+
164
  # prefix with page for easier mapping back
165
+ lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
166
+
167
+ # text = " ".join(s.get('text','') for s in spans).strip()
168
+ # if text:
169
+ # # prefix with page for easier mapping back
170
+ # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
171
  lines_on_page += 1
172
 
173
  if lines_on_page > 0:
 
188
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
189
 
190
  logger.debug(f"Full prompt length: {len(prompt)} characters")
191
+ # Changed: Print entire prompt, not truncated
192
  print("=" * 80)
193
  print("FULL LLM PROMPT:")
194
  print(prompt)
 
203
  logger.error(f"Could not save prompt to file: {e}")
204
 
205
  if not api_key:
206
+ # No API key: return empty so caller can fallback to heuristics
207
  logger.error("No API key provided")
208
  return []
209
 
210
  url = "https://openrouter.ai/api/v1/chat/completions"
211
 
212
+ # Build headers following the OpenRouter example
213
  headers = {
214
  "Authorization": f"Bearer {api_key}",
215
  "Content-Type": "application/json",
 
217
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
218
  }
219
 
220
+ # Log request details (without exposing full API key)
221
  logger.info(f"Making request to OpenRouter with model: {model}")
222
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
223
 
224
+ # Wrap the prompt as the example 'content' array expected by OpenRouter
225
  body = {
226
  "model": model,
227
  "messages": [
 
234
  ]
235
  }
236
 
237
+ # Debug: log request body (truncated) and write raw response for inspection
238
  try:
239
+ # Changed: Log full body (excluding prompt text which is already logged)
240
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
241
 
242
+ # Removed timeout parameter
243
  resp = requests.post(
244
  url=url,
245
  headers=headers,
 
250
  resp.raise_for_status()
251
 
252
  resp_text = resp.text
253
+ # Changed: Print entire response
254
  print("=" * 80)
255
  print("FULL LLM RESPONSE:")
256
  print(resp_text)
 
258
 
259
  logger.info(f"LLM raw response length: {len(resp_text)}")
260
 
261
+ # Save raw response for offline inspection
262
  try:
263
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
264
  fh.write(resp_text)
 
317
 
318
  if not text_reply:
319
  logger.error("Could not extract text reply from response")
320
+ # Changed: Print the entire response structure for debugging
321
  print("=" * 80)
322
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
323
  print(json.dumps(rj, indent=2))
324
  print("=" * 80)
325
  return []
326
 
327
+ # Changed: Print the extracted text reply
328
  print("=" * 80)
329
  print("EXTRACTED TEXT REPLY:")
330
  print(text_reply)
 
347
  except json.JSONDecodeError as e:
348
  logger.error(f"Failed to parse JSON: {e}")
349
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
350
+ # Try to find any JSON-like structure
351
  try:
352
+ # Try to extract any JSON array
353
  import re
354
  json_pattern = r'\[\s*\{.*?\}\s*\]'
355
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
 
375
 
376
  # Log parsed results
377
  logger.info(f"Parsed {len(parsed)} header items:")
378
+ for i, obj in enumerate(parsed[:10]): # Log first 10 items
379
  logger.info(f" Item {i}: {obj}")
380
 
381
  # Normalize parsed entries and return
 
385
  page = int(obj.get('page')) if obj.get('page') else None
386
  level = obj.get('suggested_level')
387
  conf = float(obj.get('confidence') or 0)
 
 
388
  if t and page is not None:
389
+ out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
+ logger.info(f"Returning {len(out)} valid header entries")
392
  return out
393
 
394
 
 
413
  logger.info("DataFrame head:")
414
  logger.info(df.head().to_string())
415
 
 
 
 
 
 
416
  # Save Excel to a file on disk
417
  output_path = "output.xlsx"
418
  try:
419
  df.to_excel(output_path, index=False, engine='openpyxl')
420
  logger.info(f"Excel file saved successfully to: {output_path}")
421
 
422
+ # Verify file was created
423
  if os.path.exists(output_path):
424
  file_size = os.path.getsize(output_path)
425
  logger.info(f"Output file exists, size: {file_size} bytes")
 
440
  gr.Textbox(label="LLM Prompt")
441
  ],
442
  outputs = gr.File(file_count="single", label="Download Excel")
443
+
444
  )
445
 
446
  if __name__ == "__main__":