Spaces:
Runtime error
Runtime error
Commit ·
f32ce28
1
Parent(s): 6056d2d
Fix API response to include file content and improve file handling
Browse files
app.py
CHANGED
|
@@ -15,9 +15,15 @@ app = Flask(__name__)
|
|
| 15 |
CORS(app)
|
| 16 |
|
| 17 |
# Configure logging
|
| 18 |
-
logging.basicConfig(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
logger = logging.getLogger()
|
| 20 |
-
logger.addHandler(logging.StreamHandler(sys.stdout))
|
| 21 |
|
| 22 |
# Constants
|
| 23 |
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
|
|
@@ -401,7 +407,14 @@ def convert_pdf():
|
|
| 401 |
markdown_path = os.path.join(session_dir, f"{base_filename}.md")
|
| 402 |
json_path = os.path.join(session_dir, f"{base_filename}.json")
|
| 403 |
|
| 404 |
-
# Define
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
markdown_url = f"/download/{session_id}/{base_filename}.md"
|
| 406 |
json_url = f"/download/{session_id}/{base_filename}.json"
|
| 407 |
|
|
@@ -465,31 +478,70 @@ def convert_pdf():
|
|
| 465 |
"command": " ".join(cmd)
|
| 466 |
}), 500
|
| 467 |
|
| 468 |
-
# Check if
|
| 469 |
md_exists = os.path.exists(markdown_path)
|
| 470 |
json_exists = os.path.exists(json_path)
|
| 471 |
|
| 472 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
if not md_exists:
|
| 474 |
-
# Look for any markdown file in the session directory
|
| 475 |
md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
|
| 476 |
if md_files:
|
| 477 |
markdown_path = os.path.join(session_dir, md_files[0])
|
| 478 |
md_exists = True
|
| 479 |
-
logging.info(f"Found
|
| 480 |
|
| 481 |
if not json_exists:
|
| 482 |
-
# Look for any JSON file in the session directory
|
| 483 |
json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
|
| 484 |
if json_files:
|
| 485 |
json_path = os.path.join(session_dir, json_files[0])
|
| 486 |
json_exists = True
|
| 487 |
logging.info(f"Found JSON file at alternate location: {json_path}")
|
| 488 |
|
| 489 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
logging.warning(f"Markdown file not found at {markdown_path}")
|
| 491 |
|
| 492 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
logging.warning(f"JSON file not found at {json_path}")
|
| 494 |
|
| 495 |
return jsonify({
|
|
@@ -497,6 +549,8 @@ def convert_pdf():
|
|
| 497 |
"session_id": session_id,
|
| 498 |
"markdown_url": markdown_url if md_exists else None,
|
| 499 |
"json_url": json_url if json_exists else None,
|
|
|
|
|
|
|
| 500 |
"message": "PDF conversion completed"
|
| 501 |
})
|
| 502 |
|
|
@@ -530,14 +584,41 @@ def convert_pdf():
|
|
| 530 |
def download_file(session_id, filename):
|
| 531 |
# Validate the session ID and filename
|
| 532 |
if not all(c.isalnum() or c == '-' for c in session_id):
|
|
|
|
| 533 |
abort(400, "Invalid session ID")
|
| 534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
base_path = os.path.join(OUTPUT_FOLDER, session_id)
|
| 536 |
file_path = os.path.join(base_path, filename)
|
| 537 |
|
|
|
|
| 538 |
if not os.path.exists(file_path):
|
| 539 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
|
|
|
|
| 541 |
return send_file(file_path, as_attachment=True)
|
| 542 |
|
| 543 |
if __name__ == '__main__':
|
|
|
|
| 15 |
CORS(app)
|
| 16 |
|
| 17 |
# Configure logging
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.INFO,
|
| 20 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 21 |
+
handlers=[
|
| 22 |
+
logging.StreamHandler(sys.stdout),
|
| 23 |
+
logging.FileHandler('/tmp/mineruapi.log')
|
| 24 |
+
]
|
| 25 |
+
)
|
| 26 |
logger = logging.getLogger()
|
|
|
|
| 27 |
|
| 28 |
# Constants
|
| 29 |
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
|
|
|
|
| 407 |
markdown_path = os.path.join(session_dir, f"{base_filename}.md")
|
| 408 |
json_path = os.path.join(session_dir, f"{base_filename}.json")
|
| 409 |
|
| 410 |
+
# Define alternate locations to search for generated files
|
| 411 |
+
alternate_locations = [
|
| 412 |
+
os.path.dirname(input_path), # Check PDF directory
|
| 413 |
+
os.path.join(OUTPUT_FOLDER, base_filename), # Check output folder with base filename
|
| 414 |
+
session_dir # Default session directory
|
| 415 |
+
]
|
| 416 |
+
|
| 417 |
+
# URLs for file download
|
| 418 |
markdown_url = f"/download/{session_id}/{base_filename}.md"
|
| 419 |
json_url = f"/download/{session_id}/{base_filename}.json"
|
| 420 |
|
|
|
|
| 478 |
"command": " ".join(cmd)
|
| 479 |
}), 500
|
| 480 |
|
| 481 |
+
# Check if files were generated
|
| 482 |
md_exists = os.path.exists(markdown_path)
|
| 483 |
json_exists = os.path.exists(json_path)
|
| 484 |
|
| 485 |
+
# Search in alternate locations if files not found
|
| 486 |
+
if not md_exists or not json_exists:
|
| 487 |
+
logging.info("Files not found in primary location, searching alternate locations...")
|
| 488 |
+
|
| 489 |
+
for location in alternate_locations:
|
| 490 |
+
if not md_exists:
|
| 491 |
+
alt_md_path = os.path.join(location, f"{base_filename}.md")
|
| 492 |
+
if os.path.exists(alt_md_path):
|
| 493 |
+
# Copy to session directory for download access
|
| 494 |
+
try:
|
| 495 |
+
shutil.copy(alt_md_path, markdown_path)
|
| 496 |
+
md_exists = True
|
| 497 |
+
logging.info(f"Found and copied Markdown file from {alt_md_path}")
|
| 498 |
+
except:
|
| 499 |
+
# If copy fails, update path to point to original
|
| 500 |
+
markdown_path = alt_md_path
|
| 501 |
+
md_exists = True
|
| 502 |
+
logging.info(f"Found Markdown file at alternate location: {alt_md_path}")
|
| 503 |
+
|
| 504 |
+
if not json_exists:
|
| 505 |
+
alt_json_path = os.path.join(location, f"{base_filename}.json")
|
| 506 |
+
if os.path.exists(alt_json_path):
|
| 507 |
+
# Copy to session directory for download access
|
| 508 |
+
try:
|
| 509 |
+
shutil.copy(alt_json_path, json_path)
|
| 510 |
+
json_exists = True
|
| 511 |
+
logging.info(f"Found and copied JSON file from {alt_json_path}")
|
| 512 |
+
except:
|
| 513 |
+
# If copy fails, update path to point to original
|
| 514 |
+
json_path = alt_json_path
|
| 515 |
+
json_exists = True
|
| 516 |
+
logging.info(f"Found JSON file at alternate location: {alt_json_path}")
|
| 517 |
+
|
| 518 |
+
# If still not found, check for any markdown or JSON files in the directory
|
| 519 |
if not md_exists:
|
|
|
|
| 520 |
md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
|
| 521 |
if md_files:
|
| 522 |
markdown_path = os.path.join(session_dir, md_files[0])
|
| 523 |
md_exists = True
|
| 524 |
+
logging.info(f"Found Markdown file at alternate location: {markdown_path}")
|
| 525 |
|
| 526 |
if not json_exists:
|
|
|
|
| 527 |
json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
|
| 528 |
if json_files:
|
| 529 |
json_path = os.path.join(session_dir, json_files[0])
|
| 530 |
json_exists = True
|
| 531 |
logging.info(f"Found JSON file at alternate location: {json_path}")
|
| 532 |
|
| 533 |
+
if md_exists:
|
| 534 |
+
with open(markdown_path, 'r', encoding='utf-8') as f:
|
| 535 |
+
markdown_content = f.read()
|
| 536 |
+
else:
|
| 537 |
+
markdown_content = None
|
| 538 |
logging.warning(f"Markdown file not found at {markdown_path}")
|
| 539 |
|
| 540 |
+
if json_exists:
|
| 541 |
+
with open(json_path, 'r', encoding='utf-8') as f:
|
| 542 |
+
json_content = json.load(f)
|
| 543 |
+
else:
|
| 544 |
+
json_content = None
|
| 545 |
logging.warning(f"JSON file not found at {json_path}")
|
| 546 |
|
| 547 |
return jsonify({
|
|
|
|
| 549 |
"session_id": session_id,
|
| 550 |
"markdown_url": markdown_url if md_exists else None,
|
| 551 |
"json_url": json_url if json_exists else None,
|
| 552 |
+
"markdown_content": markdown_content,
|
| 553 |
+
"json_content": json_content,
|
| 554 |
"message": "PDF conversion completed"
|
| 555 |
})
|
| 556 |
|
|
|
|
| 584 |
def download_file(session_id, filename):
    """Serve a generated file for a conversion session as an attachment.

    The file is looked up first in ``OUTPUT_FOLDER/<session_id>/`` and, if it
    is not there, in a fixed list of fallback directories.

    Args:
        session_id: Session identifier; only alphanumerics and ``-`` are
            accepted.
        filename: Name of the file to download; only alphanumerics, ``-``,
            ``_`` and ``.`` are accepted, and the name may not consist solely
            of dots.

    Returns:
        The response produced by ``send_file`` for the located file.

    Raises:
        Aborts with HTTP 400 when ``session_id`` or ``filename`` fails
        validation, and with HTTP 404 when no regular file is found.
    """
    # Validate the session ID and filename
    if not all(c.isalnum() or c == '-' for c in session_id):
        logging.warning(f"Invalid session ID format: {session_id}")
        abort(400, "Invalid session ID")

    # Sanitize filename. The character whitelist alone still admits "." and
    # "..", which resolve to directories, so dot-only names are rejected too.
    allowed_punctuation = {'-', '_', '.'}
    if (
        not all(c.isalnum() or c in allowed_punctuation for c in filename)
        or not filename.strip('.')
    ):
        logging.warning(f"Invalid filename format: {filename}")
        abort(400, "Invalid filename")

    # First try the primary location
    base_path = os.path.join(OUTPUT_FOLDER, session_id)
    file_path = os.path.join(base_path, filename)

    # If not found, try alternate locations. isfile() rather than exists():
    # a directory must never be handed to send_file.
    if not os.path.isfile(file_path):
        logging.info(f"File not found at primary location: {file_path}")

        # Try searching in other common locations.
        # NOTE(review): the first two entries are not scoped to session_id, so
        # any valid session can fetch a matching file from them — confirm this
        # is intended.
        alternate_locations = [
            OUTPUT_FOLDER,
            os.path.join(OUTPUT_FOLDER, os.path.splitext(filename)[0]),
            os.path.join(UPLOAD_FOLDER, session_id),
            os.path.join(tempfile.gettempdir(), f"minerupdf_{session_id}"),
        ]

        for location in alternate_locations:
            alt_path = os.path.join(location, filename)
            if os.path.isfile(alt_path):
                logging.info(f"Found file at alternate location: {alt_path}")
                file_path = alt_path
                break
        else:
            logging.warning(f"File not found in any location: {filename}")
            abort(404, "File not found")

    logging.info(f"Serving file: {file_path}")
    return send_file(file_path, as_attachment=True)
|
| 623 |
|
| 624 |
if __name__ == '__main__':
|