marcosremar2 committed on
Commit
f32ce28
·
1 Parent(s): 6056d2d

Fix API response to include file content and improve file handling

Browse files
Files changed (1) hide show
  1. app.py +92 -11
app.py CHANGED
@@ -15,9 +15,15 @@ app = Flask(__name__)
15
  CORS(app)
16
 
17
  # Configure logging
18
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
 
 
 
 
 
19
  logger = logging.getLogger()
20
- logger.addHandler(logging.StreamHandler(sys.stdout))
21
 
22
  # Constants
23
  UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
@@ -401,7 +407,14 @@ def convert_pdf():
401
  markdown_path = os.path.join(session_dir, f"{base_filename}.md")
402
  json_path = os.path.join(session_dir, f"{base_filename}.json")
403
 
404
- # Define file URLs
 
 
 
 
 
 
 
405
  markdown_url = f"/download/{session_id}/{base_filename}.md"
406
  json_url = f"/download/{session_id}/{base_filename}.json"
407
 
@@ -465,31 +478,70 @@ def convert_pdf():
465
  "command": " ".join(cmd)
466
  }), 500
467
 
468
- # Check if output files exist
469
  md_exists = os.path.exists(markdown_path)
470
  json_exists = os.path.exists(json_path)
471
 
472
- # If files don't exist in the expected location, try to find them in the session directory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  if not md_exists:
474
- # Look for any markdown file in the session directory
475
  md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
476
  if md_files:
477
  markdown_path = os.path.join(session_dir, md_files[0])
478
  md_exists = True
479
- logging.info(f"Found markdown file at alternate location: {markdown_path}")
480
 
481
  if not json_exists:
482
- # Look for any JSON file in the session directory
483
  json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
484
  if json_files:
485
  json_path = os.path.join(session_dir, json_files[0])
486
  json_exists = True
487
  logging.info(f"Found JSON file at alternate location: {json_path}")
488
 
489
- if not md_exists:
 
 
 
 
490
  logging.warning(f"Markdown file not found at {markdown_path}")
491
 
492
- if not json_exists:
 
 
 
 
493
  logging.warning(f"JSON file not found at {json_path}")
494
 
495
  return jsonify({
@@ -497,6 +549,8 @@ def convert_pdf():
497
  "session_id": session_id,
498
  "markdown_url": markdown_url if md_exists else None,
499
  "json_url": json_url if json_exists else None,
 
 
500
  "message": "PDF conversion completed"
501
  })
502
 
@@ -530,14 +584,41 @@ def convert_pdf():
530
  def download_file(session_id, filename):
531
  # Validate the session ID and filename
532
  if not all(c.isalnum() or c == '-' for c in session_id):
 
533
  abort(400, "Invalid session ID")
534
 
 
 
 
 
 
 
535
  base_path = os.path.join(OUTPUT_FOLDER, session_id)
536
  file_path = os.path.join(base_path, filename)
537
 
 
538
  if not os.path.exists(file_path):
539
- abort(404, "File not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
 
541
  return send_file(file_path, as_attachment=True)
542
 
543
  if __name__ == '__main__':
 
15
  CORS(app)
16
 
17
  # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.StreamHandler(sys.stdout),
23
+ logging.FileHandler('/tmp/mineruapi.log')
24
+ ]
25
+ )
26
  logger = logging.getLogger()
 
27
 
28
  # Constants
29
  UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
 
407
  markdown_path = os.path.join(session_dir, f"{base_filename}.md")
408
  json_path = os.path.join(session_dir, f"{base_filename}.json")
409
 
410
+ # Define alternate locations to search for generated files
411
+ alternate_locations = [
412
+ os.path.dirname(input_path), # Check PDF directory
413
+ os.path.join(OUTPUT_FOLDER, base_filename), # Check output folder with base filename
414
+ session_dir # Default session directory
415
+ ]
416
+
417
+ # URLs for file download
418
  markdown_url = f"/download/{session_id}/{base_filename}.md"
419
  json_url = f"/download/{session_id}/{base_filename}.json"
420
 
 
478
  "command": " ".join(cmd)
479
  }), 500
480
 
481
+ # Check if files were generated
482
  md_exists = os.path.exists(markdown_path)
483
  json_exists = os.path.exists(json_path)
484
 
485
+ # Search in alternate locations if files not found
486
+ if not md_exists or not json_exists:
487
+ logging.info("Files not found in primary location, searching alternate locations...")
488
+
489
+ for location in alternate_locations:
490
+ if not md_exists:
491
+ alt_md_path = os.path.join(location, f"{base_filename}.md")
492
+ if os.path.exists(alt_md_path):
493
+ # Copy to session directory for download access
494
+ try:
495
+ shutil.copy(alt_md_path, markdown_path)
496
+ md_exists = True
497
+ logging.info(f"Found and copied Markdown file from {alt_md_path}")
498
+ except:
499
+ # If copy fails, update path to point to original
500
+ markdown_path = alt_md_path
501
+ md_exists = True
502
+ logging.info(f"Found Markdown file at alternate location: {alt_md_path}")
503
+
504
+ if not json_exists:
505
+ alt_json_path = os.path.join(location, f"{base_filename}.json")
506
+ if os.path.exists(alt_json_path):
507
+ # Copy to session directory for download access
508
+ try:
509
+ shutil.copy(alt_json_path, json_path)
510
+ json_exists = True
511
+ logging.info(f"Found and copied JSON file from {alt_json_path}")
512
+ except:
513
+ # If copy fails, update path to point to original
514
+ json_path = alt_json_path
515
+ json_exists = True
516
+ logging.info(f"Found JSON file at alternate location: {alt_json_path}")
517
+
518
+ # If still not found, check for any markdown or JSON files in the directory
519
  if not md_exists:
 
520
  md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
521
  if md_files:
522
  markdown_path = os.path.join(session_dir, md_files[0])
523
  md_exists = True
524
+ logging.info(f"Found Markdown file at alternate location: {markdown_path}")
525
 
526
  if not json_exists:
 
527
  json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
528
  if json_files:
529
  json_path = os.path.join(session_dir, json_files[0])
530
  json_exists = True
531
  logging.info(f"Found JSON file at alternate location: {json_path}")
532
 
533
+ if md_exists:
534
+ with open(markdown_path, 'r', encoding='utf-8') as f:
535
+ markdown_content = f.read()
536
+ else:
537
+ markdown_content = None
538
  logging.warning(f"Markdown file not found at {markdown_path}")
539
 
540
+ if json_exists:
541
+ with open(json_path, 'r', encoding='utf-8') as f:
542
+ json_content = json.load(f)
543
+ else:
544
+ json_content = None
545
  logging.warning(f"JSON file not found at {json_path}")
546
 
547
  return jsonify({
 
549
  "session_id": session_id,
550
  "markdown_url": markdown_url if md_exists else None,
551
  "json_url": json_url if json_exists else None,
552
+ "markdown_content": markdown_content,
553
+ "json_content": json_content,
554
  "message": "PDF conversion completed"
555
  })
556
 
 
584
  def download_file(session_id, filename):
585
  # Validate the session ID and filename
586
  if not all(c.isalnum() or c == '-' for c in session_id):
587
+ logging.warning(f"Invalid session ID format: {session_id}")
588
  abort(400, "Invalid session ID")
589
 
590
+ # Sanitize filename
591
+ if not all(c.isalnum() or c in ['-', '_', '.'] for c in filename):
592
+ logging.warning(f"Invalid filename format: {filename}")
593
+ abort(400, "Invalid filename")
594
+
595
+ # First try the primary location
596
  base_path = os.path.join(OUTPUT_FOLDER, session_id)
597
  file_path = os.path.join(base_path, filename)
598
 
599
+ # If not found, try alternate locations
600
  if not os.path.exists(file_path):
601
+ logging.info(f"File not found at primary location: {file_path}")
602
+
603
+ # Try searching in other common locations
604
+ alternate_locations = [
605
+ OUTPUT_FOLDER,
606
+ os.path.join(OUTPUT_FOLDER, os.path.splitext(filename)[0]),
607
+ os.path.join(UPLOAD_FOLDER, session_id),
608
+ os.path.join(tempfile.gettempdir(), f"minerupdf_{session_id}")
609
+ ]
610
+
611
+ for location in alternate_locations:
612
+ alt_path = os.path.join(location, filename)
613
+ if os.path.exists(alt_path):
614
+ logging.info(f"Found file at alternate location: {alt_path}")
615
+ file_path = alt_path
616
+ break
617
+ else:
618
+ logging.warning(f"File not found in any location: {filename}")
619
+ abort(404, "File not found")
620
 
621
+ logging.info(f"Serving file: {file_path}")
622
  return send_file(file_path, as_attachment=True)
623
 
624
  if __name__ == '__main__':