rawanessam committed on
Commit
0bd12fb
·
verified ·
1 Parent(s): 0ff391f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -47
app.py CHANGED
@@ -3,26 +3,10 @@ import os
3
  import json
4
  import requests
5
  from io import BytesIO
6
- import gradio as gr
7
  import pandas as pd
8
- from io import BytesIO
9
  import fitz # PyMuPDF
10
-
11
  from urllib.parse import urlparse, unquote
12
- import os
13
- from io import BytesIO
14
- import re
15
- import requests
16
- import pandas as pd
17
- import fitz # PyMuPDF
18
  import re
19
- import urllib.parse
20
- import difflib
21
- from fuzzywuzzy import fuzz
22
- import copy
23
- # import tsadropboxretrieval
24
-
25
- import urllib.parse
26
  import logging
27
 
28
  # Set up logging to see everything
@@ -111,9 +95,9 @@ def openPDF(pdf_path):
111
 
112
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
113
  """Ask an LLM (OpenRouter) to identify headers in the document.
114
- Returns a list of dicts: {text, page, suggested_level, confidence}.
115
  The function sends plain page-line strings to the LLM (including page numbers)
116
- and asks for a JSON array containing only header lines with suggested levels.
117
  """
118
  logger.info("=" * 80)
119
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
@@ -157,17 +141,10 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
157
  y1 = spans[0]['bbox'][3]
158
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
159
  # continue
160
- for s in spans:
161
- # text,font,size,flags,color
162
- ArrayofTextWithFormat={s.get('text')}
163
-
164
  # prefix with page for easier mapping back
165
- lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
166
-
167
- # text = " ".join(s.get('text','') for s in spans).strip()
168
- # if text:
169
- # # prefix with page for easier mapping back
170
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
171
  lines_on_page += 1
172
 
173
  if lines_on_page > 0:
@@ -188,7 +165,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
188
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
189
 
190
  logger.debug(f"Full prompt length: {len(prompt)} characters")
191
- # Changed: Print entire prompt, not truncated
192
  print("=" * 80)
193
  print("FULL LLM PROMPT:")
194
  print(prompt)
@@ -203,13 +179,11 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
203
  logger.error(f"Could not save prompt to file: {e}")
204
 
205
  if not api_key:
206
- # No API key: return empty so caller can fallback to heuristics
207
  logger.error("No API key provided")
208
  return []
209
 
210
  url = "https://openrouter.ai/api/v1/chat/completions"
211
 
212
- # Build headers following the OpenRouter example
213
  headers = {
214
  "Authorization": f"Bearer {api_key}",
215
  "Content-Type": "application/json",
@@ -217,11 +191,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
217
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
218
  }
219
 
220
- # Log request details (without exposing full API key)
221
  logger.info(f"Making request to OpenRouter with model: {model}")
222
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
223
 
224
- # Wrap the prompt as the example 'content' array expected by OpenRouter
225
  body = {
226
  "model": model,
227
  "messages": [
@@ -234,12 +206,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
234
  ]
235
  }
236
 
237
- # Debug: log request body (truncated) and write raw response for inspection
238
  try:
239
- # Changed: Log full body (excluding prompt text which is already logged)
240
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
241
 
242
- # Removed timeout parameter
243
  resp = requests.post(
244
  url=url,
245
  headers=headers,
@@ -250,7 +219,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
250
  resp.raise_for_status()
251
 
252
  resp_text = resp.text
253
- # Changed: Print entire response
254
  print("=" * 80)
255
  print("FULL LLM RESPONSE:")
256
  print(resp_text)
@@ -258,7 +226,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
258
 
259
  logger.info(f"LLM raw response length: {len(resp_text)}")
260
 
261
- # Save raw response for offline inspection
262
  try:
263
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
264
  fh.write(resp_text)
@@ -317,14 +284,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
317
 
318
  if not text_reply:
319
  logger.error("Could not extract text reply from response")
320
- # Changed: Print the entire response structure for debugging
321
  print("=" * 80)
322
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
323
  print(json.dumps(rj, indent=2))
324
  print("=" * 80)
325
  return []
326
 
327
- # Changed: Print the extracted text reply
328
  print("=" * 80)
329
  print("EXTRACTED TEXT REPLY:")
330
  print(text_reply)
@@ -347,9 +312,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
347
  except json.JSONDecodeError as e:
348
  logger.error(f"Failed to parse JSON: {e}")
349
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
350
- # Try to find any JSON-like structure
351
  try:
352
- # Try to extract any JSON array
353
  import re
354
  json_pattern = r'\[\s*\{.*?\}\s*\]'
355
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
@@ -375,7 +338,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
375
 
376
  # Log parsed results
377
  logger.info(f"Parsed {len(parsed)} header items:")
378
- for i, obj in enumerate(parsed[:10]): # Log first 10 items
379
  logger.info(f" Item {i}: {obj}")
380
 
381
  # Normalize parsed entries and return
@@ -385,10 +348,24 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
385
  page = int(obj.get('page')) if obj.get('page') else None
386
  level = obj.get('suggested_level')
387
  conf = float(obj.get('confidence') or 0)
 
 
388
  if t and page is not None:
389
- out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
- logger.info(f"Returning {len(out)} valid header entries")
392
  return out
393
 
394
 
@@ -413,13 +390,17 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
413
  logger.info("DataFrame head:")
414
  logger.info(df.head().to_string())
415
 
 
 
 
 
 
416
  # Save Excel to a file on disk
417
  output_path = "output.xlsx"
418
  try:
419
  df.to_excel(output_path, index=False, engine='openpyxl')
420
  logger.info(f"Excel file saved successfully to: {output_path}")
421
 
422
- # Verify file was created
423
  if os.path.exists(output_path):
424
  file_size = os.path.getsize(output_path)
425
  logger.info(f"Output file exists, size: {file_size} bytes")
@@ -440,7 +421,6 @@ iface = gr.Interface(
440
  gr.Textbox(label="LLM Prompt")
441
  ],
442
  outputs = gr.File(file_count="single", label="Download Excel")
443
-
444
  )
445
 
446
  if __name__ == "__main__":
 
3
  import json
4
  import requests
5
  from io import BytesIO
 
6
  import pandas as pd
 
7
  import fitz # PyMuPDF
 
8
  from urllib.parse import urlparse, unquote
 
 
 
 
 
 
9
  import re
 
 
 
 
 
 
 
10
  import logging
11
 
12
  # Set up logging to see everything
 
95
 
96
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
97
  """Ask an LLM (OpenRouter) to identify headers in the document.
98
+ Returns a list of dicts: {text, page, suggested_level, confidence, body}.
99
  The function sends plain page-line strings to the LLM (including page numbers)
100
+ and asks for a JSON array containing headers with suggested levels and body for the last header.
101
  """
102
  logger.info("=" * 80)
103
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
 
141
  y1 = spans[0]['bbox'][3]
142
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
143
  # continue
144
+ text = " ".join(s.get('text','') for s in spans).strip()
145
+ if text:
 
 
146
  # prefix with page for easier mapping back
147
+ lines_for_prompt.append(f"PAGE {pno+1}: {text}")
 
 
 
 
 
148
  lines_on_page += 1
149
 
150
  if lines_on_page > 0:
 
165
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
166
 
167
  logger.debug(f"Full prompt length: {len(prompt)} characters")
 
168
  print("=" * 80)
169
  print("FULL LLM PROMPT:")
170
  print(prompt)
 
179
  logger.error(f"Could not save prompt to file: {e}")
180
 
181
  if not api_key:
 
182
  logger.error("No API key provided")
183
  return []
184
 
185
  url = "https://openrouter.ai/api/v1/chat/completions"
186
 
 
187
  headers = {
188
  "Authorization": f"Bearer {api_key}",
189
  "Content-Type": "application/json",
 
191
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
192
  }
193
 
 
194
  logger.info(f"Making request to OpenRouter with model: {model}")
195
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
196
 
 
197
  body = {
198
  "model": model,
199
  "messages": [
 
206
  ]
207
  }
208
 
 
209
  try:
 
210
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
211
 
 
212
  resp = requests.post(
213
  url=url,
214
  headers=headers,
 
219
  resp.raise_for_status()
220
 
221
  resp_text = resp.text
 
222
  print("=" * 80)
223
  print("FULL LLM RESPONSE:")
224
  print(resp_text)
 
226
 
227
  logger.info(f"LLM raw response length: {len(resp_text)}")
228
 
 
229
  try:
230
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
231
  fh.write(resp_text)
 
284
 
285
  if not text_reply:
286
  logger.error("Could not extract text reply from response")
 
287
  print("=" * 80)
288
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
289
  print(json.dumps(rj, indent=2))
290
  print("=" * 80)
291
  return []
292
 
 
293
  print("=" * 80)
294
  print("EXTRACTED TEXT REPLY:")
295
  print(text_reply)
 
312
  except json.JSONDecodeError as e:
313
  logger.error(f"Failed to parse JSON: {e}")
314
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
 
315
  try:
 
316
  import re
317
  json_pattern = r'\[\s*\{.*?\}\s*\]'
318
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
 
338
 
339
  # Log parsed results
340
  logger.info(f"Parsed {len(parsed)} header items:")
341
+ for i, obj in enumerate(parsed[:10]):
342
  logger.info(f" Item {i}: {obj}")
343
 
344
  # Normalize parsed entries and return
 
348
  page = int(obj.get('page')) if obj.get('page') else None
349
  level = obj.get('suggested_level')
350
  conf = float(obj.get('confidence') or 0)
351
+ body = obj.get('body', '') # Get body content, default to empty string
352
+
353
  if t and page is not None:
354
+ out.append({
355
+ 'text': t,
356
+ 'page': page-1,
357
+ 'suggested_level': level,
358
+ 'confidence': conf,
359
+ 'body': body # Add body to output
360
+ })
361
+
362
+ logger.info(f"Returning {len(out)} valid header entries with body content for last header")
363
+
364
+ # Log which entries have body content
365
+ for i, item in enumerate(out):
366
+ if item.get('body'):
367
+ logger.info(f"Entry {i} has body content (length: {len(item['body'])})")
368
 
 
369
  return out
370
 
371
 
 
390
  logger.info("DataFrame head:")
391
  logger.info(df.head().to_string())
392
 
393
+ # Check which rows have body content
394
+ bodies = df['body'].tolist()
395
+ non_empty_bodies = [b for b in bodies if b and str(b).strip()]
396
+ logger.info(f"Found {len(non_empty_bodies)} entries with body content")
397
+
398
  # Save Excel to a file on disk
399
  output_path = "output.xlsx"
400
  try:
401
  df.to_excel(output_path, index=False, engine='openpyxl')
402
  logger.info(f"Excel file saved successfully to: {output_path}")
403
 
 
404
  if os.path.exists(output_path):
405
  file_size = os.path.getsize(output_path)
406
  logger.info(f"Output file exists, size: {file_size} bytes")
 
421
  gr.Textbox(label="LLM Prompt")
422
  ],
423
  outputs = gr.File(file_count="single", label="Download Excel")
 
424
  )
425
 
426
  if __name__ == "__main__":