"""
Flask API gateway (local-only version).
Routes
------
POST /presign
POST /upload/<runId>
POST /runs
GET /runs/<runId>
GET /artifacts/<filename>
GET /outputs/<filename>
GET /work/<id>
GET /topics
GET /creators
GET /cell-sim
POST /heatmap
"""
import json
import os
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from mimetypes import guess_type
from pathlib import Path
from threading import RLock
from flask import Flask, jsonify, request, send_from_directory, render_template_string
from flask_cors import CORS
# --------------------------------------------------------------------------- #
#  Phase 1: Stub mode for Hugging Face Spaces                                  #
# --------------------------------------------------------------------------- #
STUB_MODE = os.getenv("STUB_MODE", "1") == "1"  # set to "0" later for real ML

# Global variables for the tasks and inference modules
tasks = None
inference = None

# Attempt the real ML imports; fall back to stub mode if anything fails.
if not STUB_MODE:
    try:
        # Test basic ML imports
        import torch
        import transformers
        import peft
        import cv2

        print(f"✅ ML imports successful: torch {torch.__version__}")
        print(f"✅ ML imports successful: transformers {transformers.__version__}")
        print(f"✅ ML imports successful: peft {peft.__version__}")
        print(f"✅ ML imports successful: opencv {cv2.__version__}")

        # Import ML modules
        from . import inference as inference_module, tasks as tasks_module
        from .inference import compute_heatmap

        tasks = tasks_module
        inference = inference_module
        RUNS = tasks.runs
        RUNS_LOCK = tasks.runs_lock
    except Exception as e:
        print(f"❌ ML import failed: {e}")
        # Fall back to stub mode
        STUB_MODE = True
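
# Illustrative startup configuration (environment variables; values are
# examples, and the exact module path is project-specific):
#   STUB_MODE=0 MAX_WORKERS=2 PORT=7860 python -m <package>.app
# Real ML mode only sticks when torch, transformers, peft, and cv2 all import
# cleanly; otherwise the except branch above flips STUB_MODE back on.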

# Use tasks.runs if available; otherwise a local in-memory store
if STUB_MODE or tasks is None:
    # Stub mode: lightweight imports only
    RUNS: dict[str, dict] = {}
    RUNS_LOCK = RLock()
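
# Each RUNS entry mirrors the record created in create_run(), e.g.:
# {
#     "runId": "...", "status": "queued" | "done" | ...,
#     "imageKey": "artifacts/<runId>.jpg", "topics": [...], "creators": [...],
#     "model": "...", "createdAt": "<ISO-8601>", "updatedAt": "<ISO-8601>",
# }
# (outputKey and finishedAt are added once a run completes.)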

# --------------------------------------------------------------------------- #
#  Flask application & thread pool setup                                       #
# --------------------------------------------------------------------------- #
app = Flask(__name__)

# Configure CORS for HF Spaces
CORS(app, resources={
    r"/*": {
        "origins": [
            "https://huggingface.co",
            "https://*.hf.space",
            "http://localhost:5173",  # Local development
            "http://localhost:3000",  # Local development
        ]
    }
})

# Configure port for HF Spaces
if os.getenv("PORT"):
    app.config["SERVER_NAME"] = f"0.0.0.0:{os.getenv('PORT')}"

# Thread pool to handle background inference tasks.
# Keep the worker count small for HF Spaces memory constraints.
max_workers = int(os.getenv("MAX_WORKERS", "2"))  # Default to 2 for HF Spaces
executor = ThreadPoolExecutor(max_workers=max_workers)

# Use the Space data volume, not the repo folder
from .config import (
    ARTIFACTS_DIR,
    OUTPUTS_DIR,
    JSON_INFO_DIR,
    MARKER_DIR,
    JSON_DATASETS,
    EMBEDDINGS_DATASETS,
    get_markdown_dir,
    clear_markdown_cache,
    get_markdown_cache_info,
    WRITE_ROOT,
)

# Import data from config (loaded from HF datasets)
from .config import sentences, works, creators, topics, topic_names

# --------------------------------------------------------------------------- #
#  Global Data (loaded from HF datasets via config)                            #
# --------------------------------------------------------------------------- #
# Data is now loaded from Hugging Face datasets in config.py;
# nothing is read from local files anymore.

# Debug logging for data loading
print("🔍 Data loaded from HF datasets:")
print(f"🔍 Sentences: {len(sentences)} entries")
print(f"🔍 Works: {len(works)} entries")
print(f"🔍 Topics: {len(topics)} entries")
print(f"🔍 Creators: {len(creators)} entries")

# Initialize markdown dataset
print("🔍 Initializing markdown dataset...")

# Check if we should force a full download on startup
FORCE_FULL_DOWNLOAD = os.getenv("FORCE_FULL_DOWNLOAD", "false").lower() == "true"
if FORCE_FULL_DOWNLOAD:
    print("🚀 FORCE_FULL_DOWNLOAD=true - forcing complete re-download")
    markdown_dir = get_markdown_dir(force_refresh=True)
else:
    markdown_dir = get_markdown_dir()

print(f"🔍 Markdown directory: {markdown_dir}")
print(f"🔍 Markdown directory exists: {markdown_dir.exists()}")
if markdown_dir.exists():
    work_count = len([d for d in markdown_dir.iterdir() if d.is_dir()])
    print(f"🔍 Found {work_count} work directories")
print(f"🔍 Topic names: {len(topic_names)} entries")

# --------------------------------------------------------------------------- #
#  Routes                                                                      #
# --------------------------------------------------------------------------- #
@app.route("/health")
def health() -> str:
    return "ok"
@app.route("/")
def index():
"""Serve the main frontend page."""
# Read the HTML file and serve it
html_path = Path(__file__).parent.parent.parent / "frontend" / "index.html"
if html_path.exists():
return html_path.read_text(encoding="utf-8")
else:
return "Frontend not found", 404

# Serve static frontend files
@app.route("/css/<path:filename>")
def serve_css(filename):
    """Serve CSS files."""
    css_dir = Path(__file__).parent.parent.parent / "frontend" / "css"
    return send_from_directory(css_dir, filename)

# Serve static frontend files with proper error handling
@app.route("/js/<path:filename>")
def serve_js(filename):
    """Serve JavaScript files."""
    try:
        js_dir = Path(__file__).parent.parent.parent / "frontend" / "js"
        if not js_dir.exists():
            return "JavaScript directory not found", 404
        return send_from_directory(js_dir, filename)
    except Exception as e:
        print(f"Error serving JS file {filename}: {e}")
        return "Internal server error", 500

# Route for work_id images (only matches actual work_ids)
@app.route("/images/W<work_id>", methods=["GET"])
def list_work_images(work_id: str):
    """
    Return absolute URLs for all JPEG / PNG images that belong to <work_id>.
    Only matches work_ids that start with 'W' followed by digits.
    """
    # Validate that work_id is numeric
    if not work_id.isdigit():
        return "Invalid work_id format", 400
    full_work_id = f"W{work_id}"
    print(f"🔍 list_work_images called with work_id: {full_work_id}")

    work_dir = get_markdown_dir() / full_work_id
    print(f"🔍 Work directory: {work_dir}")
    print(f"🔍 Work directory exists: {work_dir.exists()}")
    if work_dir.exists():
        work_contents = list(work_dir.iterdir())
        print(f"🔍 Work directory contents: {[f.name for f in work_contents]}")

    img_dir = work_dir / "images"
    if not img_dir.is_dir():
        print(f"❌ Images directory not found: {img_dir}")
        # Fallback: look for images directly in the work directory
        print("🔍 Checking for images directly in work directory...")
        files = sorted(
            f for f in work_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png")
        )
        if files:
            print(f"✅ Found {len(files)} images directly in work directory")
            host = request.host_url.rstrip("/")
            urls = [f"{host}/marker/{full_work_id}/{f.name}" for f in files]
            return jsonify(urls)
        else:
            print("❌ No images found in work directory either")
            return jsonify([])
    else:
        print(f"✅ Images directory found: {img_dir}")
        files = sorted(
            f for f in img_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png")
        )
        print(f"🔍 Found {len(files)} images in images directory")
        host = request.host_url.rstrip("/")
        urls = [f"{host}/marker/{full_work_id}/{f.name}" for f in files]
        return jsonify(urls)

# Route for frontend images (catches everything else)
@app.route("/images/<path:filename>")
def serve_images(filename):
    """Serve image files."""
    print(f"🔍 serve_images called with filename: {filename}")
    images_dir = Path(__file__).parent.parent.parent / "frontend" / "images"
    print(f"🔍 Images directory: {images_dir}")
    print(f"🔍 Images directory exists: {images_dir.exists()}")
    print(f"🔍 Looking for file: {images_dir / filename}")
    print(f"🔍 File exists: {(images_dir / filename).exists()}")
    if not images_dir.exists():
        return "Images directory not found", 404
    if not (images_dir / filename).exists():
        return f"Image file {filename} not found", 404
    mime, _ = guess_type(filename)
    print(f"🔍 MIME type: {mime}")
    return send_from_directory(images_dir, filename, mimetype=mime)
@app.route("/presign", methods=["POST"])
def presign_upload():
run_id = uuid.uuid4().hex
image_key = f"artifacts/{run_id}.jpg"
# Use HF Spaces environment variables
if os.getenv("SPACE_URL"):
base_url = os.getenv("SPACE_URL")
elif os.getenv("SPACE_HOST"):
base_url = f"https://{os.getenv('SPACE_HOST')}"
else:
# Fallback for local development
base_url = request.host_url.rstrip("/")
upload_url = f"{base_url}/upload/{run_id}"
return jsonify({
"runId": run_id,
"imageKey": image_key,
"upload": {"url": upload_url, "fields": {}},
})
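
# Illustrative client flow against a local dev server (URLs and the file name
# are placeholders):
#   curl -X POST http://localhost:7860/presign
#     -> {"runId": "...", "imageKey": "artifacts/<runId>.jpg",
#         "upload": {"url": "http://localhost:7860/upload/<runId>", "fields": {}}}
#   curl -F "file=@painting.jpg" http://localhost:7860/upload/<runId>
#     -> 204 No Content on success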
@app.route("/upload/<run_id>", methods=["POST"])
def upload_file(run_id: str):
"""
Receives the image file upload for the given runId and saves it to disk.
"""
try:
print(f"π€ Upload request for run {run_id}")
if "file" not in request.files:
print(f"β No file in request for run {run_id}")
return jsonify({"error": "no-file"}), 400
file = request.files["file"]
print(f"π€ File received: {file.filename}, size: {file.content_length if hasattr(file, 'content_length') else 'unknown'}")
# Ensure artifacts directory exists
try:
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
print(f"π€ Artifacts directory: {ARTIFACTS_DIR} (exists: {ARTIFACTS_DIR.exists()})")
except Exception as e:
print(f"β Failed to create artifacts directory: {e}")
return jsonify({"error": f"directory-creation-failed: {str(e)}"}), 500
# Save the file as artifacts/<runId>.jpg
file_path = ARTIFACTS_DIR / f"{run_id}.jpg"
print(f"π€ Saving file to {file_path}")
try:
file.save(str(file_path))
except Exception as e:
print(f"β Failed to save file: {e}")
return jsonify({"error": f"file-save-failed: {str(e)}"}), 500
# Check file exists otherwise 500
if not file_path.exists():
print(f"β File not saved for run {run_id}")
return jsonify({"error": "file-not-saved"}), 500
file_size = file_path.stat().st_size
print(f"β
File saved successfully for run {run_id}, size: {file_size} bytes")
# Respond with 204 No Content (success, no response body)
return "", 204
except Exception as e:
print(f"β Unexpected error in upload_file: {e}")
import traceback
traceback.print_exc()
return jsonify({"error": f"unexpected-error: {str(e)}"}), 500
@app.route("/runs", methods=["POST"])
def create_run():
"""
Body: {
"runId": "...",
"imageKey": "artifacts/...jpg",
"topics": [...],
"creators": [...],
"model": "..." }
- Save initial run status in memory
- Launch background thread for processing
"""
payload = request.get_json(force=True)
print(f"π create_run called with payload: {payload}")
run_id = payload["runId"]
image_key = payload["imageKey"]
topics = payload.get("topics", [])
creators = payload.get("creators", [])
model = payload.get("model", "paintingclip").lower() # Convert to lowercase
now = datetime.now(timezone.utc).isoformat(timespec="seconds")
print(f"π Parsed: run_id={run_id}, image_key={image_key}, topics={topics}, creators={creators}, model={model}")
# Store initial run info in the in-memory dictionary
with RUNS_LOCK:
RUNS[run_id] = {
"runId": run_id,
"status": "queued",
"imageKey": image_key,
"topics": topics,
"creators": creators,
"model": model,
"createdAt": now,
"updatedAt": now,
}
if STUB_MODE:
print(f"π Stub mode: generating fake results for {run_id}")
# Write a tiny fake result so the UI flows
results = {
"runId": run_id,
"model": model,
"top_k": 25,
"sentences": [
{
"id": f"W123_s{i:04d}",
"text": f"Stub sentence {i}.",
"english_original": f"Stub sentence {i}.", # Add this field
"work": f"W123", # Add this field
"score": 0.9 - i*0.01
}
for i in range(1, 6)
],
}
out_path = OUTPUTS_DIR / f"{run_id}.json"
print(f"π Stub mode: writing results to {out_path}")
out_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
with RUNS_LOCK:
RUNS[run_id].update({
"status": "done", # β Change from "completed" to "done"
"outputKey": f"outputs/{out_path.name}",
"finishedAt": now,
"updatedAt": now
})
print(f"π Stub mode: returning results directly for {run_id}")
return jsonify(results), 200
else:
# Submit the background inference task to the thread pool
image_path = ARTIFACTS_DIR / f"{run_id}.jpg"
print(f"π Real ML mode: submitting task for {run_id} with image {image_path}")
print(f"π Topics: {topics}, Creators: {creators}, Model: {model}")
executor.submit(tasks.run_task, run_id, str(image_path), topics, creators, model)
return jsonify({"status": "accepted"}), 202
@app.route("/runs/<run_id>", methods=["GET"])
def get_run(run_id: str):
"""
Return the status of the run (from in-memory store).
"""
run = RUNS.get(run_id)
if run is None:
return jsonify({"error": "not-found"}), 404
return jsonify(run)
@app.route("/artifacts/<path:filename>", methods=["GET"])
def get_artifact_file(filename: str):
"""Serve an uploaded image from the artifacts directory."""
return send_from_directory(ARTIFACTS_DIR, filename)
@app.route("/outputs/<path:filename>", methods=["GET"])
def get_output_file(filename: str):
"""Serve a JSON output file from the outputs directory."""
# If the filename doesn't end with .json, add it
if not filename.endswith(".json"):
filename = filename + ".json"
# Check if file exists
file_path = OUTPUTS_DIR / filename
if not file_path.exists():
return jsonify({"error": "file-not-found"}), 404
return send_from_directory(OUTPUTS_DIR, filename)
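
# Note: because the .json suffix is appended when missing, GET /outputs/<runId>
# and GET /outputs/<runId>.json both serve outputs/<runId>.json, so the
# frontend may omit the extension.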
@app.route("/work/<id>", methods=["GET"])
def get_work(id: str):
"""
Return metadata for a work plus (optionally) the paragraph that contains
a given sentence.
Query params
------------
sentence : original-English sentence text (URL-encoded)
"""
print(f"π get_work called with id: {id}")
work = works.get(id)
if work is None:
print(f"β Work not found: {id}")
print(f"π Available works: {len(works)} total")
if len(works) > 0:
sample_keys = list(works.keys())[:5]
print(f"π Sample work IDs: {sample_keys}")
return jsonify({}), 404
# ---------------- context lookup ----------------
sentence = request.args.get("sentence", "").strip()
context = ""
if sentence:
print(f"π Looking for context for sentence: {sentence[:100]}...")
md_path = get_markdown_dir() / id / f"{id}.md"
print(f"π Markdown path: {md_path}")
if md_path.is_file():
print(f"β
Markdown file found, reading content...")
content = md_path.read_text(encoding="utf-8", errors="ignore")
print(f"π Content length: {len(content)} characters")
import re
from difflib import SequenceMatcher
def normalise(txt: str) -> str:
"""lower-case, remove punctuation, collapse whitespace"""
txt = re.sub(r"[^\w\s]", " ", txt.lower())
return re.sub(r"\s+", " ", txt).strip()
target_norm = normalise(sentence)
best_para = ""
best_ratio = 0.0
# split on blank lines β paragraphs
for para in (p.strip() for p in content.split("\n\n") if p.strip()):
para_norm = normalise(para)
# 1) quick exact-substring on normalised text
if target_norm in para_norm:
context = para
break
# 2) otherwise keep best fuzzy match
ratio = SequenceMatcher(None, target_norm, para_norm).ratio()
if ratio > best_ratio:
best_ratio = ratio
best_para = para
# accept fuzzy hit if fairly close
if not context and best_ratio >= 0.55:
context = best_para
else:
print(f"β Markdown file not found: {md_path}")
else:
print(f"π No sentence provided for context lookup")
print(f"π Final context length: {len(context)} characters")
payload = {**work, "context": context}
return jsonify(payload)
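
# Example of the normalisation used above (input text is illustrative):
#   normalise("The Lady of Shalott, 1888!") -> "the lady of shalott 1888"
# so the exact-substring test tolerates punctuation and casing differences,
# while the SequenceMatcher fallback (threshold 0.55) absorbs small wording
# differences between the query sentence and the markdown paragraph.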
@app.route("/topics", methods=["GET"])
def get_topics():
if STUB_MODE:
return jsonify({
"C52119013": "Art History",
"T13922": "Historical Art and Culture Studies",
"T12632": "Visual Culture and Art Theory"
})
return jsonify(topic_names)
@app.route("/creators", methods=["GET"])
def get_creators():
if STUB_MODE:
return jsonify({
"arthur_hughes": ["W4206160935", "W2029124454"],
"francesco_hayez": ["W1982215463", "W4388661114"],
"george_stubbs": ["W2020798572", "W2021094421"]
})
return jsonify(creators)
@app.route("/models", methods=["GET"])
def get_models():
"""
Return the list of models.
"""
return jsonify(["CLIP", "PaintingCLIP"])
@app.route("/cell-sim", methods=["GET"])
def cell_similarity():
if STUB_MODE:
# Return stub results that match the expected frontend structure
return jsonify({
"sentences": [
{
"sentence_id": f"W123_s{i:04d}",
"english_original": f"Stub cell sentence {i} for testing.",
"work": "W123",
"score": 0.9 - i*0.01,
"rank": i
}
for i in range(1, 6)
]
})
try:
run_id = request.args["runId"]
row = int(request.args["row"])
col = int(request.args["col"])
k = int(request.args.get("k", 25))
# Get the run info to retrieve filtering parameters
run_info = RUNS.get(run_id, {})
topics = run_info.get("topics", [])
creators = run_info.get("creators", [])
model = run_info.get("model", "paintingclip").lower() # Convert to lowercase
img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
if not img_path.exists():
return jsonify({"error": "Image not found"}), 404
results = inference.run_inference(
str(img_path),
cell=(row, col),
top_k=k,
filter_topics=topics,
filter_creators=creators,
model_type=model,
)
return jsonify(results)
except Exception as e:
print(f"β Error in cell_similarity: {e}")
return jsonify({"error": str(e)}), 500

# --------------------------------------------------------------------------- #
#  Accurate Grad-ECLIP heat-map                                                #
# --------------------------------------------------------------------------- #
@app.route("/heatmap", methods=["POST"])
def heatmap():
    """
    Body:
    {
        "runId": "...",
        "sentence": "Full English Original text …",
        "layerIdx": -1        # optional, defaults to last block
    }
    Response:
        { "dataUrl": "data:image/png;base64,..." }
    """
    if STUB_MODE:
        # Return a stub heatmap for Phase 1 (a 1x1-pixel PNG placeholder)
        return jsonify({"dataUrl": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="})

    payload = request.get_json(force=True)
    run_id = payload["runId"]
    sentence = payload["sentence"]
    layer = int(payload.get("layerIdx", -1))

    # CLIP's text encoder is capped at 77 tokens; truncating to 300 characters
    # is a cheap character-level proxy for that limit.
    MAX_SENTENCE_LENGTH = 300
    if len(sentence) > MAX_SENTENCE_LENGTH:
        sentence = sentence[: MAX_SENTENCE_LENGTH - 3] + "..."

    # Path of the already-uploaded artefact
    img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
    if not img_path.exists():
        return jsonify({"error": "image-not-found"}), 404
    try:
        data_url = compute_heatmap(str(img_path), sentence, layer_idx=layer)
        return jsonify({"dataUrl": data_url})
    except Exception as exc:
        print(f"Heatmap generation error: {exc}")
        return jsonify({"error": str(exc)}), 500

# --------------------------------------------------------------------------- #
#  NEW: marker-output image helpers                                            #
# --------------------------------------------------------------------------- #
@app.route("/marker/<work_id>/<path:filename>", methods=["GET"])
def serve_marker_image(work_id: str, filename: str):
    """
    Static file server for data/marker_output/<work_id>/images/<filename>.
    Falls back to <work_id>/<filename> if the images directory doesn't exist.
    """
    work_dir = get_markdown_dir() / work_id
    img_dir = work_dir / "images"
    img_path = img_dir / filename

    # Try the images directory first
    if img_path.exists():
        mime, _ = guess_type(filename)
        return send_from_directory(img_dir, filename, mimetype=mime)

    # Fallback: try the work directory directly
    work_img_path = work_dir / filename
    if work_img_path.exists():
        mime, _ = guess_type(filename)
        return send_from_directory(work_dir, filename, mimetype=mime)

    return jsonify({"error": "not-found"}), 404

# --------------------------------------------------------------------------- #
#  NEW: paper file helpers                                                     #
# --------------------------------------------------------------------------- #
@app.route("/paper/<path:filename>")
def serve_paper(filename: str):
    """Serve paper files."""
    try:
        paper_dir = Path(__file__).parent.parent.parent / "frontend" / "paper"
        if not paper_dir.exists():
            return "Paper directory not found", 404
        file_path = paper_dir / filename
        if not file_path.exists():
            return f"Paper file {filename} not found", 404
        return send_from_directory(paper_dir, filename)
    except Exception as e:
        print(f"Error serving paper file {filename}: {e}")
        return "Internal server error", 500

# --------------------------------------------------------------------------- #
#  Error Handlers                                                              #
# --------------------------------------------------------------------------- #
@app.errorhandler(413)  # Payload too large
def too_large(e):
    return jsonify({"error": "File too large for HF Spaces"}), 413


@app.errorhandler(500)
def internal_error(e):
    return jsonify({"error": "Internal server error"}), 500

# --------------------------------------------------------------------------- #
#  Markdown Cache Management Endpoints                                         #
# --------------------------------------------------------------------------- #
@app.route("/cache/info", methods=["GET"])
def cache_info():
    """Get information about the markdown cache"""
    try:
        info = get_markdown_cache_info()
        return jsonify(info)
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/cache/clear", methods=["POST"])
def cache_clear():
    """Clear the markdown cache to force re-download"""
    try:
        success = clear_markdown_cache()
        if success:
            return jsonify({"message": "Cache cleared successfully"})
        else:
            return jsonify({"error": "Failed to clear cache"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/cache/refresh", methods=["POST"])
def cache_refresh():
"""Force refresh the markdown dataset"""
try:
# Clear cache and force re-download
clear_markdown_cache()
markdown_dir = get_markdown_dir(force_refresh=True)
if markdown_dir and markdown_dir.exists():
cache_info = get_markdown_cache_info()
return jsonify({
"message": "Cache refresh initiated successfully",
"cache_info": cache_info
})
else:
return jsonify({"error": "Failed to refresh cache"}), 500
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route("/cache/optimized-download", methods=["POST"])
def cache_optimized_download():
"""Start optimized markdown dataset download with parallel processing"""
try:
from .config import _download_markdown_optimized
# Clear cache first
clear_markdown_cache()
# Get the works directory
markdown_cache_dir = WRITE_ROOT / "markdown_cache"
works_dir = markdown_cache_dir / "works"
# Start optimized download
print("π Starting optimized markdown download...")
result = _download_markdown_optimized(works_dir)
if result and result.exists():
cache_info = get_markdown_cache_info()
return jsonify({
"message": "Optimized download completed successfully",
"cache_info": cache_info
})
else:
return jsonify({"error": "Optimized download failed"}), 500
except Exception as e:
return jsonify({"error": str(e)}), 500

# --------------------------------------------------------------------------- #
if __name__ == "__main__":  # invoked via python -m …
    # Use PORT environment variable for Hugging Face Spaces
    port = int(os.getenv("PORT", 7860))  # Default to 7860 for HF Spaces
    app.run(host="0.0.0.0", port=port, debug=False)