|
|
""" |
|
|
Flask API gateway (local-only version). |
|
|
|
|
|
Routes |
|
|
------ |
|
|
POST /presign |
|
|
POST /upload/<runId> |
|
|
POST /runs |
|
|
GET /runs/<runId> |
|
|
GET /artifacts/<filename> |
|
|
GET /outputs/<filename> |
|
|
GET /work/<id> |
|
|
GET /topics |
|
|
GET /creators |
|
|
GET /cell-sim |
|
|
POST /heatmap |
|
|
""" |
|
|
|
|
|
import json |
|
|
import os |
|
|
import uuid |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
from datetime import datetime, timezone |
|
|
from mimetypes import guess_type |
|
|
from pathlib import Path |
|
|
from threading import RLock |
|
|
|
|
|
from flask import Flask, jsonify, request, send_from_directory, render_template_string |
|
|
from flask_cors import CORS |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# STUB_MODE=1 (the default) serves canned responses and skips the heavy ML
# stack entirely; set STUB_MODE=0 to enable real inference.
STUB_MODE = os.getenv("STUB_MODE", "1") == "1"

# Lazily-bound module handles; populated only when the ML stack imports cleanly.
tasks = None
inference = None

if not STUB_MODE:
    try:
        # Import the heavy dependencies first so a missing/broken ML install
        # is detected before we touch the project modules that need them.
        import torch
        import transformers
        import peft
        import cv2
        print(f"✅ ML imports successful: torch {torch.__version__}")
        print(f"✅ ML imports successful: transformers {transformers.__version__}")
        print(f"✅ ML imports successful: peft {peft.__version__}")
        print(f"✅ ML imports successful: opencv {cv2.__version__}")

        from . import inference as inference_module, tasks as tasks_module
        from .inference import compute_heatmap
        tasks = tasks_module
        inference = inference_module
        # Share the task module's run store and lock so the background worker
        # and the HTTP handlers see the same state.
        RUNS = tasks.runs
        RUNS_LOCK = tasks.runs_lock
    except Exception as e:
        print(f"❌ ML import failed: {e}")
        # Any import failure degrades gracefully back to stub mode.
        STUB_MODE = True

if STUB_MODE or tasks is None:
    # Stub fallback: an in-process run store guarded by a re-entrant lock.
    RUNS: dict[str, dict] = {}
    RUNS_LOCK = RLock()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app = Flask(__name__)

# Allow the HF Spaces hosts plus common local dev servers to call the API.
CORS(app, resources={
    r"/*": {
        "origins": [
            "https://huggingface.co",
            "https://*.hf.space",
            "http://localhost:5173",
            "http://localhost:3000"
        ]
    }
})

# NOTE(review): setting SERVER_NAME to "0.0.0.0:<port>" makes Flask match the
# Host header against that literal value, which can 404 requests arriving
# under the real hostname — confirm this is intended for the deployment.
if os.getenv("PORT"):
    app.config['SERVER_NAME'] = f"0.0.0.0:{os.getenv('PORT')}"

# Background pool for inference tasks; sized via MAX_WORKERS (default 2).
max_workers = int(os.getenv("MAX_WORKERS", "2"))
executor = ThreadPoolExecutor(max_workers=max_workers)
|
|
|
|
|
|
|
|
from .config import ( |
|
|
ARTIFACTS_DIR, |
|
|
OUTPUTS_DIR, |
|
|
JSON_INFO_DIR, |
|
|
MARKER_DIR, |
|
|
JSON_DATASETS, |
|
|
EMBEDDINGS_DATASETS, |
|
|
get_markdown_dir, |
|
|
clear_markdown_cache, |
|
|
get_markdown_cache_info, |
|
|
WRITE_ROOT |
|
|
) |
|
|
|
|
|
|
|
|
from .config import sentences, works, creators, topics, topic_names |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Startup diagnostics: report dataset sizes and prime the markdown cache.
# Fix: several log prefixes were mojibake ("��", a broken surrogate pair);
# normalized to the emoji used by the neighbouring lines. Placeholder-free
# f-strings also lost their pointless "f" prefix.
# ---------------------------------------------------------------------------
print("📊 Data loaded from HF datasets:")
print(f"📊 Sentences: {len(sentences)} entries")
print(f"📊 Works: {len(works)} entries")
print(f"📊 Topics: {len(topics)} entries")
print(f"📊 Creators: {len(creators)} entries")

print("📄 Initializing markdown dataset...")

# FORCE_FULL_DOWNLOAD=true bypasses the cache and re-downloads everything.
FORCE_FULL_DOWNLOAD = os.getenv("FORCE_FULL_DOWNLOAD", "false").lower() == "true"
if FORCE_FULL_DOWNLOAD:
    print("🔄 FORCE_FULL_DOWNLOAD=true - forcing complete re-download")
    markdown_dir = get_markdown_dir(force_refresh=True)
else:
    markdown_dir = get_markdown_dir()
print(f"📄 Markdown directory: {markdown_dir}")
print(f"📄 Markdown directory exists: {markdown_dir.exists()}")
if markdown_dir.exists():
    # One sub-directory per work in the markdown cache.
    work_count = len([d for d in markdown_dir.iterdir() if d.is_dir()])
    print(f"📄 Found {work_count} work directories")
print(f"📊 Topic names: {len(topic_names)} entries")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/health")
def health() -> str:
    """Liveness probe: always answers with the literal string "ok"."""
    status = "ok"
    return status
|
|
|
|
|
|
|
|
@app.route("/")
def index():
    """Serve the main frontend page."""
    # The frontend bundle lives three levels above this module.
    page = Path(__file__).parent.parent.parent / "frontend" / "index.html"
    if not page.exists():
        return "Frontend not found", 404
    return page.read_text(encoding="utf-8")
|
|
|
|
|
|
|
|
@app.route("/css/<path:filename>")
def serve_css(filename):
    """Serve CSS files from the bundled frontend directory."""
    stylesheet_root = Path(__file__).parent.parent.parent / "frontend" / "css"
    return send_from_directory(stylesheet_root, filename)
|
|
|
|
|
|
|
|
@app.route("/js/<path:filename>")
def serve_js(filename):
    """Serve JavaScript files from the bundled frontend directory.

    Returns 404 if the js directory is missing and 500 on unexpected errors.
    """
    try:
        js_dir = Path(__file__).parent.parent.parent / "frontend" / "js"
        if not js_dir.exists():
            return "JavaScript directory not found", 404
        return send_from_directory(js_dir, filename)
    except Exception as e:
        # Bug fix: the log line previously printed the literal "(unknown)"
        # instead of the requested filename, making failures untraceable.
        print(f"Error serving JS file {filename}: {e}")
        return "Internal server error", 500
|
|
|
|
|
|
|
|
def _collect_image_urls(directory, full_work_id: str) -> list:
    """Return absolute /marker/ URLs for JPEG/PNG files directly inside *directory*."""
    files = sorted(
        f for f in directory.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png")
    )
    host = request.host_url.rstrip("/")
    return [f"{host}/marker/{full_work_id}/{f.name}" for f in files]


@app.route("/images/W<work_id>", methods=["GET"])
def list_work_images(work_id: str):
    """
    Return absolute URLs for all JPEG / PNG images that belong to <work_id>.
    Only matches work_ids that start with 'W' followed by numbers.

    Images are looked up first in <work_dir>/images/, then directly in
    <work_dir> as a fallback. Unknown works yield an empty list.
    """
    if not work_id.isdigit():
        return "Invalid work_id format", 400

    full_work_id = f"W{work_id}"
    print(f"🔍 list_work_images called with work_id: {full_work_id}")
    work_dir = get_markdown_dir() / full_work_id
    print(f"🔍 Work directory: {work_dir}")
    print(f"🔍 Work directory exists: {work_dir.exists()}")
    if not work_dir.exists():
        # Robustness fix: iterating a missing work_dir used to raise
        # FileNotFoundError (HTTP 500); an unknown work now returns [].
        return jsonify([])

    work_contents = list(work_dir.iterdir())
    print(f"🔍 Work directory contents: {[f.name for f in work_contents]}")

    img_dir = work_dir / "images"
    if img_dir.is_dir():
        print(f"✅ Images directory found: {img_dir}")
        urls = _collect_image_urls(img_dir, full_work_id)
        print(f"🔍 Found {len(urls)} images in images directory")
        return jsonify(urls)

    # Fallback: some works keep their images next to the markdown file.
    print(f"❌ Images directory not found: {img_dir}")
    print(f"🔍 Checking for images directly in work directory...")
    urls = _collect_image_urls(work_dir, full_work_id)
    if urls:
        print(f"✅ Found {len(urls)} images directly in work directory")
        return jsonify(urls)
    print(f"❌ No images found in work directory either")
    return jsonify([])
|
|
|
|
|
|
|
|
@app.route("/images/<path:filename>")
def serve_images(filename):
    """Serve image files from the bundled frontend images directory.

    Returns 404 when the directory or file is missing; the MIME type is
    guessed from the filename.
    """
    # Bug fix: this log line and the 404 body below previously contained the
    # literal "(unknown)" instead of the requested filename.
    print(f"🔍 serve_images called with filename: {filename}")
    images_dir = Path(__file__).parent.parent.parent / "frontend" / "images"
    print(f"🔍 Images directory: {images_dir}")
    print(f"🔍 Images directory exists: {images_dir.exists()}")
    print(f"🔍 Looking for file: {images_dir / filename}")
    print(f"🔍 File exists: {(images_dir / filename).exists()}")

    if not images_dir.exists():
        return "Images directory not found", 404

    if not (images_dir / filename).exists():
        return f"Image file {filename} not found", 404

    mime, _ = guess_type(filename)
    print(f"🔍 MIME type: {mime}")
    return send_from_directory(images_dir, filename, mimetype=mime)
|
|
|
|
|
|
|
|
@app.route("/presign", methods=["POST"])
def presign_upload():
    """Mint a new run id and tell the client where to upload its image."""
    new_run_id = uuid.uuid4().hex
    artifact_key = f"artifacts/{new_run_id}.jpg"

    # Prefer the explicitly configured space URL, then the HF-provided host,
    # and finally whatever host this request arrived on.
    space_url = os.getenv("SPACE_URL")
    space_host = os.getenv("SPACE_HOST")
    if space_url:
        base_url = space_url
    elif space_host:
        base_url = f"https://{space_host}"
    else:
        base_url = request.host_url.rstrip("/")

    return jsonify({
        "runId": new_run_id,
        "imageKey": artifact_key,
        "upload": {"url": f"{base_url}/upload/{new_run_id}", "fields": {}},
    })
|
|
|
|
|
|
|
|
@app.route("/upload/<run_id>", methods=["POST"])
def upload_file(run_id: str):
    """
    Receives the image file upload for the given runId and saves it to disk.

    Expects a multipart form with a "file" part; the upload is always stored
    as ARTIFACTS_DIR/<run_id>.jpg regardless of the original filename.
    Returns 204 on success, 400 when the "file" part is missing, and 500 on
    any filesystem failure.
    """
    try:
        print(f"📤 Upload request for run {run_id}")

        if "file" not in request.files:
            print(f"❌ No file in request for run {run_id}")
            return jsonify({"error": "no-file"}), 400

        file = request.files["file"]
        print(f"📤 File received: {file.filename}, size: {file.content_length if hasattr(file, 'content_length') else 'unknown'}")

        # Ensure the artifacts directory exists before writing into it.
        try:
            ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
            print(f"📤 Artifacts directory: {ARTIFACTS_DIR} (exists: {ARTIFACTS_DIR.exists()})")
        except Exception as e:
            print(f"❌ Failed to create artifacts directory: {e}")
            return jsonify({"error": f"directory-creation-failed: {str(e)}"}), 500

        # Uploads are normalized to <run_id>.jpg so later pipeline stages can
        # locate the image by run id alone.
        file_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        print(f"📤 Saving file to {file_path}")

        try:
            file.save(str(file_path))
        except Exception as e:
            print(f"❌ Failed to save file: {e}")
            return jsonify({"error": f"file-save-failed: {str(e)}"}), 500

        # Belt-and-braces: confirm the file actually landed on disk.
        if not file_path.exists():
            print(f"❌ File not saved for run {run_id}")
            return jsonify({"error": "file-not-saved"}), 500

        file_size = file_path.stat().st_size
        print(f"✅ File saved successfully for run {run_id}, size: {file_size} bytes")

        # 204 No Content: the client only needs the success signal.
        return "", 204

    except Exception as e:
        print(f"❌ Unexpected error in upload_file: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": f"unexpected-error: {str(e)}"}), 500
|
|
|
|
|
|
|
|
@app.route("/runs", methods=["POST"])
def create_run():
    """
    Body: {
      "runId": "...",
      "imageKey": "artifacts/...jpg",
      "topics": [...],
      "creators": [...],
      "model": "..." }
    - Save initial run status in memory
    - Launch background thread for processing

    In STUB_MODE the canned result is computed inline and returned with 200;
    otherwise the task is queued on the thread pool and 202 is returned.
    """
    payload = request.get_json(force=True)
    print(f"🔍 create_run called with payload: {payload}")

    # runId/imageKey are required; topics/creators/model are optional filters.
    run_id = payload["runId"]
    image_key = payload["imageKey"]
    topics = payload.get("topics", [])
    creators = payload.get("creators", [])
    model = payload.get("model", "paintingclip").lower()
    now = datetime.now(timezone.utc).isoformat(timespec="seconds")

    print(f"🔍 Parsed: run_id={run_id}, image_key={image_key}, topics={topics}, creators={creators}, model={model}")

    # Record the run as queued before any processing starts so polling
    # clients can see it immediately.
    with RUNS_LOCK:
        RUNS[run_id] = {
            "runId": run_id,
            "status": "queued",
            "imageKey": image_key,
            "topics": topics,
            "creators": creators,
            "model": model,
            "createdAt": now,
            "updatedAt": now,
        }

    if STUB_MODE:
        print(f"🔍 Stub mode: generating fake results for {run_id}")

        # Five deterministic placeholder sentences with descending scores.
        results = {
            "runId": run_id,
            "model": model,
            "top_k": 25,
            "sentences": [
                {
                    "id": f"W123_s{i:04d}",
                    "text": f"Stub sentence {i}.",
                    "english_original": f"Stub sentence {i}.",
                    "work": f"W123",
                    "score": 0.9 - i*0.01
                }
                for i in range(1, 6)
            ],
        }
        out_path = OUTPUTS_DIR / f"{run_id}.json"
        print(f"🔍 Stub mode: writing results to {out_path}")
        out_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")

        # Mark the run finished; stubbed runs complete synchronously.
        with RUNS_LOCK:
            RUNS[run_id].update({
                "status": "done",
                "outputKey": f"outputs/{out_path.name}",
                "finishedAt": now,
                "updatedAt": now
            })
        print(f"🔍 Stub mode: returning results directly for {run_id}")
        return jsonify(results), 200
    else:
        # Real ML mode: hand off to the background worker and return 202.
        image_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        print(f"🔍 Real ML mode: submitting task for {run_id} with image {image_path}")
        print(f"🔍 Topics: {topics}, Creators: {creators}, Model: {model}")
        executor.submit(tasks.run_task, run_id, str(image_path), topics, creators, model)
        return jsonify({"status": "accepted"}), 202
|
|
|
|
|
|
|
|
@app.route("/runs/<run_id>", methods=["GET"])
def get_run(run_id: str):
    """
    Return the status of the run (from in-memory store).

    Responds 404 with {"error": "not-found"} for unknown run ids.
    """
    # Consistency fix: every writer guards RUNS with RUNS_LOCK, but this
    # reader did not. Take the lock and snapshot the record so the response
    # cannot be mutated mid-serialization by a background worker.
    with RUNS_LOCK:
        run = RUNS.get(run_id)
        snapshot = dict(run) if run is not None else None
    if snapshot is None:
        return jsonify({"error": "not-found"}), 404
    return jsonify(snapshot)
|
|
|
|
|
|
|
|
@app.route("/artifacts/<path:filename>", methods=["GET"])
def get_artifact_file(filename: str):
    """Serve an uploaded image from the artifacts directory."""
    artifact_root = ARTIFACTS_DIR
    return send_from_directory(artifact_root, filename)
|
|
|
|
|
|
|
|
@app.route("/outputs/<path:filename>", methods=["GET"])
def get_output_file(filename: str):
    """Serve a JSON output file from the outputs directory."""
    # Callers may omit the extension; normalize to *.json.
    name = filename if filename.endswith(".json") else f"{filename}.json"

    if not (OUTPUTS_DIR / name).exists():
        return jsonify({"error": "file-not-found"}), 404

    return send_from_directory(OUTPUTS_DIR, name)
|
|
|
|
|
|
|
|
@app.route("/work/<id>", methods=["GET"])
def get_work(id: str):
    """
    Return metadata for a work plus (optionally) the paragraph that contains
    a given sentence.

    Query params
    ------------
    sentence : original-English sentence text (URL-encoded)

    The response is the work's metadata dict plus a "context" key holding the
    matched paragraph ("" when no sentence was given or no match was found).
    """
    print(f"🔍 get_work called with id: {id}")
    work = works.get(id)
    if work is None:
        print(f"❌ Work not found: {id}")
        print(f"🔍 Available works: {len(works)} total")
        if len(works) > 0:
            sample_keys = list(works.keys())[:5]
            print(f"🔍 Sample work IDs: {sample_keys}")
        return jsonify({}), 404

    sentence = request.args.get("sentence", "").strip()
    context = ""
    if sentence:
        print(f"🔍 Looking for context for sentence: {sentence[:100]}...")
        # The work's markdown lives at <markdown_dir>/<id>/<id>.md.
        md_path = get_markdown_dir() / id / f"{id}.md"
        print(f"🔍 Markdown path: {md_path}")
        if md_path.is_file():
            print(f"✅ Markdown file found, reading content...")
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            print(f"🔍 Content length: {len(content)} characters")
            import re
            from difflib import SequenceMatcher

            def normalise(txt: str) -> str:
                """lower-case, remove punctuation, collapse whitespace"""
                txt = re.sub(r"[^\w\s]", " ", txt.lower())
                return re.sub(r"\s+", " ", txt).strip()

            target_norm = normalise(sentence)
            best_para = ""
            best_ratio = 0.0

            # Paragraphs are blank-line-separated blocks of the markdown.
            for para in (p.strip() for p in content.split("\n\n") if p.strip()):
                para_norm = normalise(para)

                # Exact (normalized) substring match wins outright.
                if target_norm in para_norm:
                    context = para
                    break

                # Otherwise remember the fuzziest-best paragraph seen so far.
                ratio = SequenceMatcher(None, target_norm, para_norm).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_para = para

            # Fuzzy fallback; 0.55 threshold presumably tuned empirically —
            # TODO(review): confirm against real data.
            if not context and best_ratio >= 0.55:
                context = best_para

        else:
            print(f"❌ Markdown file not found: {md_path}")
    else:
        print(f"🔍 No sentence provided for context lookup")

    print(f"🔍 Final context length: {len(context)} characters")
    payload = {**work, "context": context}
    return jsonify(payload)
|
|
|
|
|
|
|
|
@app.route("/topics", methods=["GET"])
def get_topics():
    """Return the topic-id → human-readable-name mapping."""
    if not STUB_MODE:
        return jsonify(topic_names)
    # Canned sample used when the ML stack is unavailable.
    return jsonify({
        "C52119013": "Art History",
        "T13922": "Historical Art and Culture Studies",
        "T12632": "Visual Culture and Art Theory"
    })
|
|
|
|
|
|
|
|
@app.route("/creators", methods=["GET"])
def get_creators():
    """Return the creator → work-ids mapping."""
    if not STUB_MODE:
        return jsonify(creators)
    # Canned sample used when the ML stack is unavailable.
    return jsonify({
        "arthur_hughes": ["W4206160935", "W2029124454"],
        "francesco_hayez": ["W1982215463", "W4388661114"],
        "george_stubbs": ["W2020798572", "W2021094421"]
    })
|
|
|
|
|
|
|
|
@app.route("/models", methods=["GET"])
def get_models():
    """
    Return the list of models.
    """
    available = ["CLIP", "PaintingCLIP"]
    return jsonify(available)
|
|
|
|
|
|
|
|
@app.route("/cell-sim", methods=["GET"])
def cell_similarity():
    """
    Run per-cell similarity search for an existing run's image.

    Query params
    ------------
    runId : required, id of a previously created run
    row, col : required ints, grid cell to query
    k : optional int, number of results (default 25)
    """
    if STUB_MODE:
        # Canned payload so the frontend can be exercised without the ML stack.
        return jsonify({
            "sentences": [
                {
                    "sentence_id": f"W123_s{i:04d}",
                    "english_original": f"Stub cell sentence {i} for testing.",
                    "work": "W123",
                    "score": 0.9 - i*0.01,
                    "rank": i
                }
                for i in range(1, 6)
            ]
        })

    # Bug fix: missing/malformed query params (KeyError/ValueError) previously
    # fell into the generic handler and surfaced as HTTP 500; they are client
    # errors and now return 400.
    try:
        run_id = request.args["runId"]
        row = int(request.args["row"])
        col = int(request.args["col"])
        k = int(request.args.get("k", 25))
    except (KeyError, ValueError) as e:
        return jsonify({"error": f"bad-request: {e}"}), 400

    try:
        # Reuse the filters/model recorded when the run was created.
        run_info = RUNS.get(run_id, {})
        topics = run_info.get("topics", [])
        creators = run_info.get("creators", [])
        model = run_info.get("model", "paintingclip").lower()

        img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        if not img_path.exists():
            return jsonify({"error": "Image not found"}), 404

        results = inference.run_inference(
            str(img_path),
            cell=(row, col),
            top_k=k,
            filter_topics=topics,
            filter_creators=creators,
            model_type=model,
        )
        return jsonify(results)
    except Exception as e:
        print(f"❌ Error in cell_similarity: {e}")
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/heatmap", methods=["POST"])
def heatmap():
    """
    Body:
    {
      "runId": "...",
      "sentence": "Full English Original text …",
      "layerIdx": -1   # optional, defaults to last block
    }

    Response:
      { "dataUrl": "data:image/png;base64,..." }
    """
    if STUB_MODE:
        # 1x1 transparent PNG placeholder.
        return jsonify({"dataUrl": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="})

    payload = request.get_json(force=True) or {}

    # Robustness fix: missing/malformed body fields previously raised an
    # uncaught KeyError and surfaced as HTTP 500; they are client errors and
    # now return 400.
    try:
        run_id = payload["runId"]
        sentence = payload["sentence"]
        layer = int(payload.get("layerIdx", -1))
    except (KeyError, TypeError, ValueError) as e:
        return jsonify({"error": f"bad-request: {e}"}), 400

    # Very long sentences are truncated with an ellipsis before encoding.
    MAX_SENTENCE_LENGTH = 300
    if len(sentence) > MAX_SENTENCE_LENGTH:
        sentence = sentence[: MAX_SENTENCE_LENGTH - 3] + "..."

    img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
    if not img_path.exists():
        return jsonify({"error": "image-not-found"}), 404

    try:
        data_url = compute_heatmap(str(img_path), sentence, layer_idx=layer)
        return jsonify({"dataUrl": data_url})
    except Exception as exc:
        print(f"Heatmap generation error: {exc}")
        return jsonify({"error": str(exc)}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/marker/<work_id>/<path:filename>", methods=["GET"])
def serve_marker_image(work_id: str, filename: str):
    """
    Static file server for data/marker_output/<work_id>/images/<filename>
    Falls back to work_id/<filename> if images directory doesn't exist
    """
    base_dir = get_markdown_dir() / work_id

    # Try <work>/images/<file> first, then <work>/<file> directly.
    for candidate_dir in (base_dir / "images", base_dir):
        if (candidate_dir / filename).exists():
            mime, _ = guess_type(filename)
            return send_from_directory(candidate_dir, filename, mimetype=mime)

    return jsonify({"error": "not-found"}), 404
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/paper/<path:filename>")
def serve_paper(filename: str):
    """Serve paper files from the bundled frontend paper directory.

    Returns 404 when the directory or file is missing, 500 on unexpected
    errors.
    """
    try:
        paper_dir = Path(__file__).parent.parent.parent / "frontend" / "paper"
        if not paper_dir.exists():
            return "Paper directory not found", 404

        file_path = paper_dir / filename
        if not file_path.exists():
            # Bug fix: the 404 body previously contained the literal
            # "(unknown)" instead of the requested filename.
            return f"Paper file {filename} not found", 404

        return send_from_directory(paper_dir, filename)
    except Exception as e:
        # Bug fix: same "(unknown)" placeholder in the error log.
        print(f"Error serving paper file {filename}: {e}")
        return "Internal server error", 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.errorhandler(413)
def too_large(e):
    """Translate Flask's 413 (payload too large) into a JSON error body."""
    body = {"error": "File too large for HF Spaces"}
    return jsonify(body), 413
|
|
|
|
|
@app.errorhandler(500)
def internal_error(e):
    """Translate unhandled server errors into a generic JSON body."""
    body = {"error": "Internal server error"}
    return jsonify(body), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/cache/info", methods=["GET"])
def cache_info():
    """Get information about the markdown cache"""
    try:
        details = get_markdown_cache_info()
        return jsonify(details)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
@app.route("/cache/clear", methods=["POST"])
def cache_clear():
    """Clear the markdown cache to force re-download"""
    try:
        if clear_markdown_cache():
            return jsonify({"message": "Cache cleared successfully"})
        return jsonify({"error": "Failed to clear cache"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
@app.route("/cache/refresh", methods=["POST"])
def cache_refresh():
    """Force refresh the markdown dataset"""
    try:
        # Drop the cached copy, then trigger a forced re-download.
        clear_markdown_cache()
        refreshed_dir = get_markdown_dir(force_refresh=True)

        if not (refreshed_dir and refreshed_dir.exists()):
            return jsonify({"error": "Failed to refresh cache"}), 500

        # Local renamed from `cache_info` to avoid shadowing the /cache/info view.
        info = get_markdown_cache_info()
        return jsonify({
            "message": "Cache refresh initiated successfully",
            "cache_info": info
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
@app.route("/cache/optimized-download", methods=["POST"])
def cache_optimized_download():
    """Start optimized markdown dataset download with parallel processing"""
    try:
        from .config import _download_markdown_optimized

        # Start from a clean slate so the optimized path re-downloads everything.
        clear_markdown_cache()

        target_dir = WRITE_ROOT / "markdown_cache" / "works"

        print("🚀 Starting optimized markdown download...")
        result = _download_markdown_optimized(target_dir)

        if not (result and result.exists()):
            return jsonify({"error": "Optimized download failed"}), 500

        # Local renamed from `cache_info` to avoid shadowing the /cache/info view.
        info = get_markdown_cache_info()
        return jsonify({
            "message": "Optimized download completed successfully",
            "cache_info": info
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Local entry point; HF Spaces injects PORT, otherwise default to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port, debug=False)
|
|
|