rawanessam committed on
Commit
4ccce7a
·
verified ·
1 Parent(s): 8d70086

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -46
app.py CHANGED
@@ -23,94 +23,129 @@ import copy
23
  # import tsadropboxretrieval
24
 
25
  import urllib.parse
 
 
 
 
 
 
 
 
 
 
 
26
 
27
-
28
-
29
-
30
 
31
  def get_toc_page_numbers(doc, max_pages_to_check=15):
32
  toc_pages = []
33
-
 
34
  # 1. Existing Dot Pattern (looking for ".....")
35
  dot_pattern = re.compile(r"\.{2,}")
36
-
37
  # 2. NEW: Title Pattern (looking for specific headers)
38
  # ^ and $ ensure the line is JUST that word (ignoring "The contents of the bag...")
39
  # re.IGNORECASE makes it match "CONTENTS", "Contents", "Index", etc.
40
  title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)
41
-
42
  for page_num in range(min(len(doc), max_pages_to_check)):
43
  page = doc.load_page(page_num)
44
  blocks = page.get_text("dict")["blocks"]
45
-
46
  dot_line_count = 0
47
  has_toc_title = False
48
-
 
 
49
  for block in blocks:
50
  for line in block.get("lines", []):
51
  # Extract text from spans (mimicking get_spaced_text_from_spans)
52
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
53
-
54
  # CHECK A: Does the line have dots?
55
  if dot_pattern.search(line_text):
56
  dot_line_count += 1
57
-
 
58
  # CHECK B: Is this line a Title?
59
  # We check this early in the loop. If a page has a title "Contents",
60
  # we mark it immediately.
61
  if title_pattern.match(line_text):
62
  has_toc_title = True
63
-
 
64
  # CONDITION:
65
  # It is a TOC page if it has a Title OR if it has dot leaders.
66
  # We use 'dot_line_count >= 1' to be sensitive to single-item lists.
67
  if has_toc_title or dot_line_count >= 1:
68
  toc_pages.append(page_num)
69
-
 
70
  # RETURN:
71
  # If we found TOC pages (e.g., [2, 3]), we return [0, 1, 2, 3]
72
  # This covers the cover page, inside cover, and the TOC itself.
73
  if toc_pages:
74
  last_toc_page = toc_pages[0]
75
- return list(range(0, last_toc_page + 1))
76
-
 
 
 
77
  return [] # Return empty list if nothing found
78
 
79
 
80
  def openPDF(pdf_path):
 
81
  pdf_path = pdf_path.replace('dl=0', 'dl=1')
82
  response = requests.get(pdf_path)
 
83
  pdf_content = BytesIO(response.content)
84
  if not pdf_content:
 
85
  raise ValueError("No valid PDF content found.")
86
 
87
  doc = fitz.open(stream=pdf_content, filetype="pdf")
 
88
  return doc
89
 
90
- def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=None, top_margin=70, bottom_margin=85):
91
  """Ask an LLM (OpenRouter) to identify headers in the document.
92
-
93
  Returns a list of dicts: {text, page, suggested_level, confidence}.
94
  The function sends plain page-line strings to the LLM (including page numbers)
95
  and asks for a JSON array containing only header lines with suggested levels.
96
  """
97
- doc=openPDF(pdf_path)
98
- api_key='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'
 
 
 
 
 
 
99
  if api_key is None:
100
-
101
  api_key = os.getenv("OPENROUTER_API_KEY") or None
102
- model=str(model)
 
103
  toc_pages = get_toc_page_numbers(doc)
104
  lines_for_prompt = []
105
 
 
 
 
106
  # Collect text lines from pages (skip TOC pages)
 
107
  for pno in range(len(doc)):
108
  if pages_to_check and pno not in pages_to_check:
109
  continue
110
  if pno in toc_pages:
 
111
  continue
 
112
  page = doc.load_page(pno)
113
  page_height = page.rect.height
 
 
114
  for block in page.get_text("dict").get('blocks', []):
115
  if block.get('type') != 0:
116
  continue
@@ -126,20 +161,47 @@ def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=
126
  if text:
127
  # prefix with page for easier mapping back
128
  lines_for_prompt.append(f"PAGE {pno+1}: {text}")
129
-
 
 
 
 
 
 
 
130
  if not lines_for_prompt:
 
131
  return []
132
-
133
- prompt = (
134
- LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
135
- )
136
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  if not api_key:
138
  # No API key: return empty so caller can fallback to heuristics
 
139
  return []
140
-
141
  url = "https://openrouter.ai/api/v1/chat/completions"
142
-
143
  # Build headers following the OpenRouter example
144
  headers = {
145
  "Authorization": f"Bearer {api_key}",
@@ -147,7 +209,11 @@ def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=
147
  "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
148
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
149
  }
150
-
 
 
 
 
151
  # Wrap the prompt as the example 'content' array expected by OpenRouter
152
  body = {
153
  "model": model,
@@ -160,66 +226,151 @@ def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=
160
  }
161
  ]
162
  }
163
-
164
  # Debug: log request body (truncated) and write raw response for inspection
165
  try:
166
- print("LLM request (truncated):", prompt[:1000])
 
 
 
167
  resp = requests.post(
168
  url=url,
169
  headers=headers,
170
- data=json.dumps(body),
171
-
172
  )
 
 
173
  resp.raise_for_status()
 
174
  resp_text = resp.text
175
- print("LLM raw response length:", len(resp_text))
 
 
 
 
 
 
 
176
  # Save raw response for offline inspection
177
  try:
178
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
179
  fh.write(resp_text)
 
180
  except Exception as e:
181
- print("Warning: could not write llm_debug.json:", e)
 
182
  rj = resp.json()
183
- print("LLM parsed response keys:", list(rj.keys()) if isinstance(rj, dict) else type(rj))
 
 
 
 
 
 
184
  except Exception as e:
185
- print("LLM call failed:", repr(e))
186
  return []
187
-
188
  # Extract textual reply robustly
189
  text_reply = None
190
  if isinstance(rj, dict):
191
  choices = rj.get('choices') or []
 
 
192
  if choices:
 
 
 
193
  c0 = choices[0]
194
  msg = c0.get('message') or c0.get('delta') or {}
195
  content = msg.get('content')
 
196
  if isinstance(content, list):
197
- for c in content:
 
198
  if c.get('type') == 'text' and c.get('text'):
199
  text_reply = c.get('text')
 
200
  break
201
  elif isinstance(content, str):
202
  text_reply = content
 
203
  elif isinstance(msg, dict) and msg.get('content') and isinstance(msg.get('content'), dict):
204
  text_reply = msg.get('content').get('text')
 
 
 
205
  if not text_reply:
 
206
  for c in rj.get('choices', []):
207
  if isinstance(c.get('text'), str):
208
  text_reply = c.get('text')
 
209
  break
210
-
211
  if not text_reply:
 
 
 
 
 
 
212
  return []
213
-
 
 
 
 
 
 
 
 
 
214
  s = text_reply.strip()
215
  start = s.find('[')
216
  end = s.rfind(']')
217
  js = s[start:end+1] if start != -1 and end != -1 else s
 
 
 
 
218
  try:
219
  parsed = json.loads(js)
220
- except Exception:
221
- return []
222
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  # Normalize parsed entries and return
224
  out = []
225
  for obj in parsed:
@@ -229,21 +380,48 @@ def identify_headers_with_openrouter(pdf_path, model,LLM_prompt, pages_to_check=
229
  conf = float(obj.get('confidence') or 0)
230
  if t and page is not None:
231
  out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
 
 
232
  return out
233
 
234
 
235
  def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
 
 
 
 
236
  # Call your existing function
237
  result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
238
 
239
  if not result:
 
240
  return None
241
 
 
242
  df = pd.DataFrame(result)
243
 
 
 
 
 
 
 
244
  # Save Excel to a file on disk
245
  output_path = "output.xlsx"
246
- df.to_excel(output_path, index=False, engine='openpyxl')
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  return output_path # return file path, not BytesIO
249
 
@@ -257,4 +435,7 @@ iface = gr.Interface(
257
  outputs=gr.File(label="Download Excel") # File expects a path
258
  )
259
 
260
- iface.launch()
 
 
 
 
23
  # import tsadropboxretrieval
24
 
25
  import urllib.parse
26
import logging

# Configure root logging once at import time: everything at DEBUG and above
# is echoed to the console and written to a fresh debug.log each run.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.DEBUG,
    format=_LOG_FORMAT,
    handlers=[
        logging.StreamHandler(),                     # console output
        logging.FileHandler('debug.log', mode='w'),  # file, truncated per run
    ],
)

logger = logging.getLogger(__name__)
 
 
39
 
40
def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Scan the leading pages of *doc* for table-of-contents pages.

    A page counts as a TOC page if it contains a line that is exactly a
    TOC-style title ("Table of Contents", "Contents", "Index") or at least
    one dot-leader line ("Chapter 1 ..... 5").

    Parameters
    ----------
    doc : fitz.Document
        An open PyMuPDF document (anything with ``__len__`` and
        ``load_page(i).get_text("dict")`` works).
    max_pages_to_check : int
        Number of leading pages to inspect.

    Returns
    -------
    list[int]
        ``[0 .. last detected TOC page]`` so callers can skip the cover,
        inside cover and the TOC itself in one go; ``[]`` when nothing
        TOC-like is found.
    """
    toc_pages = []
    logger.debug(f"Starting TOC detection, checking first {max_pages_to_check} pages")

    # 1. Dot-leader pattern ("....." between an entry and its page number).
    dot_pattern = re.compile(r"\.{2,}")
    # 2. Title pattern: ^/$ ensure the line is JUST that word, so body text
    #    like "The contents of the bag..." does not match; IGNORECASE covers
    #    "CONTENTS", "Contents", "index", etc.
    title_pattern = re.compile(r"^\s*(table of contents|contents|index)\s*$", re.IGNORECASE)

    for page_num in range(min(len(doc), max_pages_to_check)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        dot_line_count = 0
        has_toc_title = False
        logger.debug(f"Checking page {page_num} for TOC")

        for block in blocks:
            for line in block.get("lines", []):
                # Join the span texts to rebuild the visible line.
                line_text = " ".join(span["text"] for span in line["spans"]).strip()

                # CHECK A: dot leaders?
                if dot_pattern.search(line_text):
                    dot_line_count += 1
                    logger.debug(f"  Found dot pattern on page {page_num}: '{line_text[:50]}...'")

                # CHECK B: a bare TOC title?
                if title_pattern.match(line_text):
                    has_toc_title = True
                    logger.debug(f"  Found TOC title on page {page_num}: '{line_text}'")

        # A page is TOC if it has a title OR any dot-leader line;
        # >= 1 keeps us sensitive to single-item lists.
        if has_toc_title or dot_line_count >= 1:
            toc_pages.append(page_num)
            logger.info(f"Page {page_num} identified as TOC page")

    if toc_pages:
        # BUG FIX: the original used toc_pages[0] (the FIRST TOC page)
        # although the variable name and the contract say LAST — for a
        # multi-page TOC [2, 3] it returned [0, 1, 2] and the caller then
        # fed page 3 (still TOC) to the LLM.  Use the last detected page
        # so [2, 3] yields [0, 1, 2, 3].
        last_toc_page = toc_pages[-1]
        result = list(range(0, last_toc_page + 1))
        logger.info(f"TOC pages found: {result}")
        return result

    logger.info("No TOC pages found")
    return []  # Return empty list if nothing found
96
 
97
 
98
def openPDF(pdf_path):
    """Download a PDF from a (Dropbox-style) URL and open it with PyMuPDF.

    Rewrites ``dl=0`` to ``dl=1`` so Dropbox serves the raw file instead
    of its HTML preview page.

    Returns
    -------
    fitz.Document
        The opened document.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the download yields an empty body.
    """
    logger.info(f"Opening PDF from URL: {pdf_path}")
    pdf_path = pdf_path.replace('dl=0', 'dl=1')
    response = requests.get(pdf_path)
    logger.debug(f"PDF download response status: {response.status_code}")
    # Fail fast on HTTP errors instead of feeding an error page to fitz.
    response.raise_for_status()

    # BUG FIX: the original tested `if not pdf_content:` on a BytesIO
    # instance, which is always truthy, so the guard could never fire.
    # Check the downloaded bytes themselves.
    if not response.content:
        logger.error("No valid PDF content found.")
        raise ValueError("No valid PDF content found.")

    pdf_content = BytesIO(response.content)
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    logger.info(f"PDF opened successfully, {len(doc)} pages")
    return doc
111
 
112
+ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=70, bottom_margin=85):
113
  """Ask an LLM (OpenRouter) to identify headers in the document.
 
114
  Returns a list of dicts: {text, page, suggested_level, confidence}.
115
  The function sends plain page-line strings to the LLM (including page numbers)
116
  and asks for a JSON array containing only header lines with suggested levels.
117
  """
118
+ logger.info("=" * 80)
119
+ logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
120
+ logger.info(f"PDF Path: {pdf_path}")
121
+ logger.info(f"Model: {model}")
122
+ logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
123
+
124
+ doc = openPDF(pdf_path)
125
+ api_key = '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'
126
  if api_key is None:
 
127
  api_key = os.getenv("OPENROUTER_API_KEY") or None
128
+
129
+ model = str(model)
130
  toc_pages = get_toc_page_numbers(doc)
131
  lines_for_prompt = []
132
 
133
+ logger.info(f"TOC pages to skip: {toc_pages}")
134
+ logger.info(f"Total pages in document: {len(doc)}")
135
+
136
  # Collect text lines from pages (skip TOC pages)
137
+ total_lines = 0
138
  for pno in range(len(doc)):
139
  if pages_to_check and pno not in pages_to_check:
140
  continue
141
  if pno in toc_pages:
142
+ logger.debug(f"Skipping TOC page {pno}")
143
  continue
144
+
145
  page = doc.load_page(pno)
146
  page_height = page.rect.height
147
+ lines_on_page = 0
148
+
149
  for block in page.get_text("dict").get('blocks', []):
150
  if block.get('type') != 0:
151
  continue
 
161
  if text:
162
  # prefix with page for easier mapping back
163
  lines_for_prompt.append(f"PAGE {pno+1}: {text}")
164
+ lines_on_page += 1
165
+
166
+ if lines_on_page > 0:
167
+ logger.debug(f"Page {pno}: collected {lines_on_page} lines")
168
+ total_lines += lines_on_page
169
+
170
+ logger.info(f"Total lines collected for LLM: {total_lines}")
171
+
172
  if not lines_for_prompt:
173
+ logger.warning("No lines collected for prompt")
174
  return []
175
+
176
+ # Log sample of lines
177
+ logger.info("Sample lines (first 10):")
178
+ for i, line in enumerate(lines_for_prompt[:10]):
179
+ logger.info(f" {i}: {line}")
180
+
181
+ prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
182
+
183
+ logger.debug(f"Full prompt length: {len(prompt)} characters")
184
+ # Changed: Print entire prompt, not truncated
185
+ print("=" * 80)
186
+ print("FULL LLM PROMPT:")
187
+ print(prompt)
188
+ print("=" * 80)
189
+
190
+ # Also log to file
191
+ try:
192
+ with open("full_prompt.txt", "w", encoding="utf-8") as f:
193
+ f.write(prompt)
194
+ logger.info("Full prompt saved to full_prompt.txt")
195
+ except Exception as e:
196
+ logger.error(f"Could not save prompt to file: {e}")
197
+
198
  if not api_key:
199
  # No API key: return empty so caller can fallback to heuristics
200
+ logger.error("No API key provided")
201
  return []
202
+
203
  url = "https://openrouter.ai/api/v1/chat/completions"
204
+
205
  # Build headers following the OpenRouter example
206
  headers = {
207
  "Authorization": f"Bearer {api_key}",
 
209
  "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
210
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
211
  }
212
+
213
+ # Log request details (without exposing full API key)
214
+ logger.info(f"Making request to OpenRouter with model: {model}")
215
+ logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
216
+
217
  # Wrap the prompt as the example 'content' array expected by OpenRouter
218
  body = {
219
  "model": model,
 
226
  }
227
  ]
228
  }
229
+
230
  # Debug: log request body (truncated) and write raw response for inspection
231
  try:
232
+ # Changed: Log full body (excluding prompt text which is already logged)
233
+ logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
234
+
235
+ # Removed timeout parameter
236
  resp = requests.post(
237
  url=url,
238
  headers=headers,
239
+ data=json.dumps(body)
 
240
  )
241
+
242
+ logger.info(f"HTTP Response Status: {resp.status_code}")
243
  resp.raise_for_status()
244
+
245
  resp_text = resp.text
246
+ # Changed: Print entire response
247
+ print("=" * 80)
248
+ print("FULL LLM RESPONSE:")
249
+ print(resp_text)
250
+ print("=" * 80)
251
+
252
+ logger.info(f"LLM raw response length: {len(resp_text)}")
253
+
254
  # Save raw response for offline inspection
255
  try:
256
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
257
  fh.write(resp_text)
258
+ logger.info("Raw response saved to llm_debug.json")
259
  except Exception as e:
260
+ logger.error(f"Warning: could not write llm_debug.json: {e}")
261
+
262
  rj = resp.json()
263
+ logger.info(f"LLM parsed response type: {type(rj)}")
264
+ if isinstance(rj, dict):
265
+ logger.debug(f"Response keys: {list(rj.keys())}")
266
+
267
+ except requests.exceptions.RequestException as e:
268
+ logger.error(f"HTTP request failed: {repr(e)}")
269
+ return []
270
  except Exception as e:
271
+ logger.error(f"LLM call failed: {repr(e)}")
272
  return []
273
+
274
  # Extract textual reply robustly
275
  text_reply = None
276
  if isinstance(rj, dict):
277
  choices = rj.get('choices') or []
278
+ logger.debug(f"Number of choices in response: {len(choices)}")
279
+
280
  if choices:
281
+ for i, c in enumerate(choices):
282
+ logger.debug(f"Choice {i}: {c}")
283
+
284
  c0 = choices[0]
285
  msg = c0.get('message') or c0.get('delta') or {}
286
  content = msg.get('content')
287
+
288
  if isinstance(content, list):
289
+ logger.debug(f"Content is a list with {len(content)} items")
290
+ for idx, c in enumerate(content):
291
  if c.get('type') == 'text' and c.get('text'):
292
  text_reply = c.get('text')
293
+ logger.debug(f"Found text reply in content[{idx}], length: {len(text_reply)}")
294
  break
295
  elif isinstance(content, str):
296
  text_reply = content
297
+ logger.debug(f"Content is string, length: {len(text_reply)}")
298
  elif isinstance(msg, dict) and msg.get('content') and isinstance(msg.get('content'), dict):
299
  text_reply = msg.get('content').get('text')
300
+ logger.debug(f"Found text in nested content dict")
301
+
302
+ # Fallback extraction
303
  if not text_reply:
304
+ logger.debug("Trying fallback extraction from choices")
305
  for c in rj.get('choices', []):
306
  if isinstance(c.get('text'), str):
307
  text_reply = c.get('text')
308
+ logger.debug(f"Found text reply in choice.text, length: {len(text_reply)}")
309
  break
310
+
311
  if not text_reply:
312
+ logger.error("Could not extract text reply from response")
313
+ # Changed: Print the entire response structure for debugging
314
+ print("=" * 80)
315
+ print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
316
+ print(json.dumps(rj, indent=2))
317
+ print("=" * 80)
318
  return []
319
+
320
+ # Changed: Print the extracted text reply
321
+ print("=" * 80)
322
+ print("EXTRACTED TEXT REPLY:")
323
+ print(text_reply)
324
+ print("=" * 80)
325
+
326
+ logger.info(f"Extracted text reply length: {len(text_reply)}")
327
+ logger.debug(f"First 500 chars of reply: {text_reply[:500]}...")
328
+
329
  s = text_reply.strip()
330
  start = s.find('[')
331
  end = s.rfind(']')
332
  js = s[start:end+1] if start != -1 and end != -1 else s
333
+
334
+ logger.debug(f"Looking for JSON array: start={start}, end={end}")
335
+ logger.debug(f"Extracted JSON string (first 500 chars): {js[:500]}...")
336
+
337
  try:
338
  parsed = json.loads(js)
339
+ logger.info(f"Successfully parsed JSON, got {len(parsed)} items")
340
+ except json.JSONDecodeError as e:
341
+ logger.error(f"Failed to parse JSON: {e}")
342
+ logger.error(f"JSON string that failed to parse: {js[:1000]}")
343
+ # Try to find any JSON-like structure
344
+ try:
345
+ # Try to extract any JSON array
346
+ import re
347
+ json_pattern = r'\[\s*\{.*?\}\s*\]'
348
+ matches = re.findall(json_pattern, text_reply, re.DOTALL)
349
+ if matches:
350
+ logger.info(f"Found {len(matches)} potential JSON arrays via regex")
351
+ for i, match in enumerate(matches):
352
+ try:
353
+ parsed = json.loads(match)
354
+ logger.info(f"Successfully parsed regex match {i} with {len(parsed)} items")
355
+ break
356
+ except json.JSONDecodeError as e2:
357
+ logger.debug(f"Regex match {i} also failed: {e2}")
358
+ continue
359
+ else:
360
+ logger.error("All regex matches failed to parse")
361
+ return []
362
+ else:
363
+ logger.error("No JSON-like pattern found via regex")
364
+ return []
365
+ except Exception as e2:
366
+ logger.error(f"Regex extraction also failed: {e2}")
367
+ return []
368
+
369
+ # Log parsed results
370
+ logger.info(f"Parsed {len(parsed)} header items:")
371
+ for i, obj in enumerate(parsed[:10]): # Log first 10 items
372
+ logger.info(f" Item {i}: {obj}")
373
+
374
  # Normalize parsed entries and return
375
  out = []
376
  for obj in parsed:
 
380
  conf = float(obj.get('confidence') or 0)
381
  if t and page is not None:
382
  out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
383
+
384
+ logger.info(f"Returning {len(out)} valid header entries")
385
  return out
386
 
387
 
388
def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
    """Run LLM header detection on *pdf_path* and dump the result to Excel.

    Returns the path of the written ``output.xlsx``, or ``None`` when no
    headers were found or the file could not be written (gr.File expects
    a path, not a BytesIO).
    """
    logger.info("=" * 80)
    logger.info("STARTING IDENTIFY_HEADERS_AND_SAVE_EXCEL")
    logger.info(f"Inputs - PDF: {pdf_path}, Model: {model}")

    # Delegate the heavy lifting to the OpenRouter-based extractor.
    headers = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
    if not headers:
        logger.warning("No results returned from identify_headers_with_openrouter")
        return None

    logger.info(f"Got {len(headers)} results, creating DataFrame")
    frame = pd.DataFrame(headers)

    # Surface the DataFrame layout in the log for debugging.
    logger.info(f"DataFrame shape: {frame.shape}")
    logger.info(f"DataFrame columns: {frame.columns.tolist()}")
    logger.info("DataFrame head:")
    logger.info(frame.head().to_string())

    output_path = "output.xlsx"
    try:
        frame.to_excel(output_path, index=False, engine='openpyxl')
        logger.info(f"Excel file saved successfully to: {output_path}")

        # Sanity-check that the file actually landed on disk.
        if os.path.exists(output_path):
            file_size = os.path.getsize(output_path)
            logger.info(f"Output file exists, size: {file_size} bytes")
        else:
            logger.error(f"Output file was not created at: {output_path}")

    except Exception as e:
        logger.error(f"Failed to save Excel file: {e}")
        return None

    return output_path  # return file path, not BytesIO
427
 
 
435
  outputs=gr.File(label="Download Excel") # File expects a path
436
  )
437
 
438
if __name__ == "__main__":
    # Script entry point: announce startup on both channels, then block
    # on the Gradio server.
    print("Starting Gradio interface...")
    logger.info("Launching Gradio interface")
    iface.launch()