File size: 27,403 Bytes
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81d196c
 
 
 
56c637d
 
 
 
 
 
 
 
 
 
 
 
81d196c
 
 
 
 
 
 
 
56c637d
 
 
 
 
0e61117
81d196c
0e61117
 
 
 
 
 
 
 
a8562ee
 
 
 
 
 
 
 
 
 
 
0e61117
3cb9a37
 
 
 
0e61117
a8562ee
 
 
0e61117
 
80ac589
 
 
 
4f1c614
 
3a15d23
7cc3172
 
5c93ea3
 
80ac589
0e61117
efbac81
 
 
0e61117
efbac81
0e61117
efbac81
 
0e61117
4ac1f80
3a15d23
4ac1f80
 
 
3a15d23
 
 
 
7cc3172
 
 
 
 
 
 
3a15d23
 
 
 
 
4ac1f80
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8562ee
0e61117
 
a8562ee
 
 
 
 
 
 
 
0e61117
7f64014
 
7e0c9d7
 
 
7f64014
7e0c9d7
7f64014
 
 
 
 
 
abb9aca
07a1661
 
 
 
 
abb9aca
7e0c9d7
abb9aca
07a1661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e0c9d7
7f64014
7e0c9d7
 
7f64014
0e61117
 
 
95cf2e4
0e61117
95cf2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
 
 
 
 
 
57a6561
a8562ee
 
 
 
 
57a6561
a8562ee
 
57a6561
 
 
a8562ee
57a6561
 
a8562ee
57a6561
0e61117
 
 
 
 
 
 
2c9a398
 
 
 
 
 
 
 
 
 
 
 
586c545
 
2c9a398
 
 
 
 
586c545
2c9a398
 
 
586c545
2c9a398
 
 
 
 
586c545
2c9a398
 
 
586c545
2c9a398
 
 
586c545
2c9a398
 
 
 
 
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ac1f80
 
0e61117
 
 
 
3e73f62
0e61117
4ac1f80
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ac1f80
0e61117
 
 
 
 
 
ee7d732
 
 
 
 
 
 
0e61117
 
 
 
4ac1f80
0e61117
 
 
 
27c513f
0e61117
 
 
 
4ac1f80
0e61117
 
 
 
4ac1f80
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586c545
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb9aca
0e61117
 
abb9aca
 
 
 
 
0e61117
 
 
 
 
 
abb9aca
3a15d23
abb9aca
0e61117
abb9aca
0e61117
abb9aca
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb9aca
 
 
 
 
 
0e61117
 
 
 
 
 
fd40d52
 
 
 
 
 
0e61117
 
 
 
 
fd40d52
 
 
 
 
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
6d3ee3c
0e61117
 
6d3ee3c
 
 
 
 
 
 
0e61117
 
 
 
586c545
 
 
 
 
 
 
 
 
 
3e73f62
586c545
 
 
3e73f62
586c545
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abb9aca
07a1661
0e61117
abb9aca
 
0e61117
07a1661
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
 
3420ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
3cb9a37
 
a8562ee
 
 
 
 
 
 
3cb9a37
7cc3172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33b499e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cb9a37
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
"""
Flask API gateway (local-only version).

Routes
------
POST /presign
POST /upload/<runId>
POST /runs
GET  /runs/<runId>
GET  /artifacts/<filename>
GET  /outputs/<filename>
GET  /work/<id>
GET  /topics
GET  /creators
GET  /cell-sim
POST /heatmap
"""

import json
import os
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from mimetypes import guess_type
from pathlib import Path
from threading import RLock

from flask import Flask, jsonify, request, send_from_directory, render_template_string
from flask_cors import CORS

# --------------------------------------------------------------------------- #
#  Phase 1: Stub mode for Hugging Face Spaces                                #
# --------------------------------------------------------------------------- #
# Stub mode is enabled when the STUB_MODE environment variable is "1"; that is
# also the default when the variable is unset.
STUB_MODE = os.getenv("STUB_MODE", "1") == "1"  # set to "0" later for real ML

# Handles to the ML-backed project modules. They stay None unless the real-ML
# imports below succeed.
tasks = None
inference = None

# Real-ML mode: check that the heavy dependencies import, then bind the project
# modules. Any failure is printed and the app falls back to stub mode.
if not STUB_MODE:
    try:
        # Test basic ML imports
        import torch
        import transformers
        import peft
        import cv2
        print(f"βœ… ML imports successful: torch {torch.__version__}")
        print(f"βœ… ML imports successful: transformers {transformers.__version__}")
        print(f"βœ… ML imports successful: peft {peft.__version__}")
        print(f"βœ… ML imports successful: opencv {cv2.__version__}")
        
        # Import ML modules
        from . import inference as inference_module, tasks as tasks_module
        from .inference import compute_heatmap
        tasks = tasks_module
        inference = inference_module
        # Reuse the run registry and lock owned by the tasks module
        RUNS = tasks.runs
        RUNS_LOCK = tasks.runs_lock
    except Exception as e:
        print(f"❌ ML import failed: {e}")
        # Fall back to stub mode
        STUB_MODE = True

# Use tasks.runs if available; otherwise a local in-memory store
if STUB_MODE or tasks is None:
    # Stub mode (or failed ML import): keep run state in this module
    RUNS: dict[str, dict] = {}
    RUNS_LOCK = RLock()

# --------------------------------------------------------------------------- #
#  Flask application & thread pool setup                                      #
# --------------------------------------------------------------------------- #
app = Flask(__name__)
# Configure CORS for HF Spaces.
# Flask-CORS evaluates an origin string that contains regex metacharacters as
# a regular expression, not as a glob. The previous "https://*.hf.space" entry
# therefore did not match Space subdomains; an anchored regex is used instead.
CORS(app, resources={
    r"/*": {
        "origins": [
            "https://huggingface.co",
            r"^https://[A-Za-z0-9.-]+\.hf\.space$",
            "http://localhost:5173",  # Local development
            "http://localhost:3000"   # Local development
        ]
    }
})

# Configure port for HF Spaces
# NOTE(review): SERVER_NAME is Flask's canonical host name setting, not a bind
# address. Depending on the Flask version it may affect routing of requests
# that arrive under a different Host header - confirm this is intended.
if os.getenv("PORT"):
    app.config['SERVER_NAME'] = f"0.0.0.0:{os.getenv('PORT')}"

# Thread pool to handle background inference tasks
# Reduce workers for HF Spaces memory constraints
max_workers = int(os.getenv("MAX_WORKERS", "2"))  # Default to 2 for HF Spaces
executor = ThreadPoolExecutor(max_workers=max_workers)

# Use the Space data volume, not the repo folder
from .config import (
    ARTIFACTS_DIR,
    OUTPUTS_DIR,
    JSON_INFO_DIR,
    MARKER_DIR,
    JSON_DATASETS,
    EMBEDDINGS_DATASETS,
    get_markdown_dir,
    clear_markdown_cache,
    get_markdown_cache_info,
    WRITE_ROOT
)

# Import data from config (loaded from HF datasets)
from .config import sentences, works, creators, topics, topic_names

# --------------------------------------------------------------------------- #
#  Global Data (loaded from HF datasets via config)                            #
# --------------------------------------------------------------------------- #
# Data is now loaded from Hugging Face datasets in config.py
# No need to load from local files anymore

# Debug logging for data loading: report how many entries each collection
# imported from config contains.
print(f"οΏ½οΏ½ Data loaded from HF datasets:")
print(f"πŸ“Š   Sentences: {len(sentences)} entries")
print(f"πŸ“Š   Works: {len(works)} entries")
print(f"πŸ“Š   Topics: {len(topics)} entries")
print(f"οΏ½οΏ½   Creators: {len(creators)} entries")

# Initialize markdown dataset
print(f"πŸ“„ Initializing markdown dataset...")
# Check if we should force a full download on startup
# (only the case-insensitive value "true" enables it)
FORCE_FULL_DOWNLOAD = os.getenv("FORCE_FULL_DOWNLOAD", "false").lower() == "true"
if FORCE_FULL_DOWNLOAD:
    print("πŸ”„ FORCE_FULL_DOWNLOAD=true - forcing complete re-download")
    markdown_dir = get_markdown_dir(force_refresh=True)
else:
    markdown_dir = get_markdown_dir()
print(f"πŸ“„ Markdown directory: {markdown_dir}")
print(f"οΏ½οΏ½ Markdown directory exists: {markdown_dir.exists()}")
if markdown_dir.exists():
    # Count only sub-directories of the markdown directory
    work_count = len([d for d in markdown_dir.iterdir() if d.is_dir()])
    print(f"οΏ½οΏ½ Found {work_count} work directories")
print(f"πŸ“Š   Topic names: {len(topic_names)} entries")


# --------------------------------------------------------------------------- #
#  Routes                                                                     #
# --------------------------------------------------------------------------- #
@app.route("/health")
def health() -> str:
    return "ok"


@app.route("/")
def index():
    """Serve the main frontend page."""
    # Read the HTML file and serve it
    html_path = Path(__file__).parent.parent.parent / "frontend" / "index.html"
    if html_path.exists():
        return html_path.read_text(encoding="utf-8")
    else:
        return "Frontend not found", 404

# Serve static frontend files
@app.route("/css/<path:filename>")
def serve_css(filename):
    """Serve CSS files."""
    css_dir = Path(__file__).parent.parent.parent / "frontend" / "css"
    return send_from_directory(css_dir, filename)

# Serve static frontend files with proper error handling
@app.route("/js/<path:filename>")
def serve_js(filename):
    try:
        js_dir = Path(__file__).parent.parent.parent / "frontend" / "js"
        if not js_dir.exists():
            return "JavaScript directory not found", 404
        return send_from_directory(js_dir, filename)
    except Exception as e:
        print(f"Error serving JS file {filename}: {e}")
        return "Internal server error", 500

# Route for work_id images (only matches actual work_ids)
@app.route("/images/W<work_id>", methods=["GET"])
def list_work_images(work_id: str):
    """
    Return absolute URLs for all JPEG / PNG images that belong to <work_id>.
    Only matches work_ids that start with 'W' followed by numbers.
    """
    # Validate that work_id is numeric
    if not work_id.isdigit():
        return "Invalid work_id format", 400
    
    full_work_id = f"W{work_id}"
    print(f"πŸ” list_work_images called with work_id: {full_work_id}")
    work_dir = get_markdown_dir() / full_work_id
    print(f"πŸ” Work directory: {work_dir}")
    print(f"πŸ” Work directory exists: {work_dir.exists()}")
    if work_dir.exists():
        work_contents = list(work_dir.iterdir())
        print(f"πŸ” Work directory contents: {[f.name for f in work_contents]}")
    img_dir = work_dir / "images"
    if not img_dir.is_dir():
        print(f"❌ Images directory not found: {img_dir}")
        # Fallback: look for images directly in the work directory
        print(f"πŸ” Checking for images directly in work directory...")
        files = sorted(
            f for f in work_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png")
        )
        if files:
            print(f"βœ… Found {len(files)} images directly in work directory")
            host = request.host_url.rstrip("/")
            urls = [f"{host}/marker/{full_work_id}/{f.name}" for f in files]
            return jsonify(urls)
        else:
            print(f"❌ No images found in work directory either")
            return jsonify([])
    else:
        print(f"βœ… Images directory found: {img_dir}")
        files = sorted(
            f for f in img_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png")
        )
        print(f"πŸ” Found {len(files)} images in images directory")
    host = request.host_url.rstrip("/")
    urls = [f"{host}/marker/{full_work_id}/{f.name}" for f in files]
    return jsonify(urls)

# Route for frontend images (catches everything else)
@app.route("/images/<path:filename>")
def serve_images(filename):
    """Serve image files."""
    print(f"πŸ” serve_images called with filename: {filename}")
    images_dir = Path(__file__).parent.parent.parent / "frontend" / "images"
    print(f"πŸ” Images directory: {images_dir}")
    print(f"πŸ” Images directory exists: {images_dir.exists()}")
    print(f"πŸ” Looking for file: {images_dir / filename}")
    print(f"πŸ” File exists: {(images_dir / filename).exists()}")
    
    if not images_dir.exists():
        return "Images directory not found", 404
    
    if not (images_dir / filename).exists():
        return f"Image file {filename} not found", 404
    
    mime, _ = guess_type(filename)
    print(f"πŸ” MIME type: {mime}")
    return send_from_directory(images_dir, filename, mimetype=mime)


@app.route("/presign", methods=["POST"])
def presign_upload():
    run_id = uuid.uuid4().hex
    image_key = f"artifacts/{run_id}.jpg"
    
    # Use HF Spaces environment variables
    if os.getenv("SPACE_URL"):
        base_url = os.getenv("SPACE_URL")
    elif os.getenv("SPACE_HOST"):
        base_url = f"https://{os.getenv('SPACE_HOST')}"
    else:
        # Fallback for local development
        base_url = request.host_url.rstrip("/")
    
    upload_url = f"{base_url}/upload/{run_id}"
    
    return jsonify({
        "runId": run_id,
        "imageKey": image_key,
        "upload": {"url": upload_url, "fields": {}},
    })


@app.route("/upload/<run_id>", methods=["POST"])
def upload_file(run_id: str):
    """
    Receives the image file upload for the given runId and saves it to disk.
    """
    try:
        print(f"πŸ“€ Upload request for run {run_id}")
        
        if "file" not in request.files:
            print(f"❌ No file in request for run {run_id}")
            return jsonify({"error": "no-file"}), 400
        
        file = request.files["file"]
        print(f"πŸ“€ File received: {file.filename}, size: {file.content_length if hasattr(file, 'content_length') else 'unknown'}")
        
        # Ensure artifacts directory exists
        try:
            ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
            print(f"πŸ“€ Artifacts directory: {ARTIFACTS_DIR} (exists: {ARTIFACTS_DIR.exists()})")
        except Exception as e:
            print(f"❌ Failed to create artifacts directory: {e}")
            return jsonify({"error": f"directory-creation-failed: {str(e)}"}), 500
        
        # Save the file as artifacts/<runId>.jpg
        file_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        print(f"πŸ“€ Saving file to {file_path}")
        
        try:
            file.save(str(file_path))
        except Exception as e:
            print(f"❌ Failed to save file: {e}")
            return jsonify({"error": f"file-save-failed: {str(e)}"}), 500
        
        # Check file exists otherwise 500
        if not file_path.exists():
            print(f"❌ File not saved for run {run_id}")
            return jsonify({"error": "file-not-saved"}), 500
        
        file_size = file_path.stat().st_size
        print(f"βœ… File saved successfully for run {run_id}, size: {file_size} bytes")
        
        # Respond with 204 No Content (success, no response body)
        return "", 204
        
    except Exception as e:
        print(f"❌ Unexpected error in upload_file: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": f"unexpected-error: {str(e)}"}), 500


@app.route("/runs", methods=["POST"])
def create_run():
    """
    Body: { 
    "runId": "...", 
    "imageKey": "artifacts/...jpg", 
    "topics": [...], 
    "creators": [...], 
    "model": "..." }
    - Save initial run status in memory
    - Launch background thread for processing
    """
    payload = request.get_json(force=True)
    print(f"πŸ” create_run called with payload: {payload}")
    
    run_id = payload["runId"]
    image_key = payload["imageKey"]
    topics = payload.get("topics", [])
    creators = payload.get("creators", [])
    model = payload.get("model", "paintingclip").lower()  # Convert to lowercase
    now = datetime.now(timezone.utc).isoformat(timespec="seconds")
    
    print(f"πŸ” Parsed: run_id={run_id}, image_key={image_key}, topics={topics}, creators={creators}, model={model}")

    # Store initial run info in the in-memory dictionary
    with RUNS_LOCK:
        RUNS[run_id] = {
            "runId": run_id,
            "status": "queued",
            "imageKey": image_key,
            "topics": topics,
            "creators": creators,
            "model": model,
            "createdAt": now,
            "updatedAt": now,
        }

    if STUB_MODE:
        print(f"πŸ” Stub mode: generating fake results for {run_id}")
        # Write a tiny fake result so the UI flows
        results = {
            "runId": run_id,
            "model": model,
            "top_k": 25,
            "sentences": [
                {
                    "id": f"W123_s{i:04d}", 
                    "text": f"Stub sentence {i}.", 
                    "english_original": f"Stub sentence {i}.",  # Add this field
                    "work": f"W123",  # Add this field
                    "score": 0.9 - i*0.01
                }
                for i in range(1, 6)
            ],
        }
        out_path = OUTPUTS_DIR / f"{run_id}.json"
        print(f"πŸ” Stub mode: writing results to {out_path}")
        out_path.write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")
        
        with RUNS_LOCK:
            RUNS[run_id].update({
                "status": "done",  # ← Change from "completed" to "done"
                "outputKey": f"outputs/{out_path.name}",
                "finishedAt": now,
                "updatedAt": now
            })
        print(f"πŸ” Stub mode: returning results directly for {run_id}")
        return jsonify(results), 200
    else:
        # Submit the background inference task to the thread pool
        image_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        print(f"πŸ” Real ML mode: submitting task for {run_id} with image {image_path}")
        print(f"πŸ” Topics: {topics}, Creators: {creators}, Model: {model}")
        executor.submit(tasks.run_task, run_id, str(image_path), topics, creators, model)
        return jsonify({"status": "accepted"}), 202


@app.route("/runs/<run_id>", methods=["GET"])
def get_run(run_id: str):
    """
    Return the status of the run (from in-memory store).
    """
    run = RUNS.get(run_id)
    if run is None:
        return jsonify({"error": "not-found"}), 404
    return jsonify(run)


@app.route("/artifacts/<path:filename>", methods=["GET"])
def get_artifact_file(filename: str):
    """Serve an uploaded image from the artifacts directory."""
    return send_from_directory(ARTIFACTS_DIR, filename)


@app.route("/outputs/<path:filename>", methods=["GET"])
def get_output_file(filename: str):
    """Serve a JSON output file from the outputs directory."""

    # If the filename doesn't end with .json, add it
    if not filename.endswith(".json"):
        filename = filename + ".json"

    # Check if file exists
    file_path = OUTPUTS_DIR / filename
    if not file_path.exists():
        return jsonify({"error": "file-not-found"}), 404

    return send_from_directory(OUTPUTS_DIR, filename)


@app.route("/work/<id>", methods=["GET"])
def get_work(id: str):
    """
    Return metadata for a work plus (optionally) the paragraph that contains
    a given sentence.

    Query params
    ------------
    sentence : original-English sentence text (URL-encoded)
    """
    print(f"πŸ” get_work called with id: {id}")
    work = works.get(id)
    if work is None:
        print(f"❌ Work not found: {id}")
        print(f"πŸ” Available works: {len(works)} total")
        if len(works) > 0:
            sample_keys = list(works.keys())[:5]
            print(f"πŸ” Sample work IDs: {sample_keys}")
        return jsonify({}), 404

    # ---------------- context lookup ----------------
    sentence = request.args.get("sentence", "").strip()
    context = ""
    if sentence:
        print(f"πŸ” Looking for context for sentence: {sentence[:100]}...")
        md_path = get_markdown_dir() / id / f"{id}.md"
        print(f"πŸ” Markdown path: {md_path}")
        if md_path.is_file():
            print(f"βœ… Markdown file found, reading content...")
            content = md_path.read_text(encoding="utf-8", errors="ignore")
            print(f"πŸ” Content length: {len(content)} characters")
            import re
            from difflib import SequenceMatcher

            def normalise(txt: str) -> str:
                """lower-case, remove punctuation, collapse whitespace"""
                txt = re.sub(r"[^\w\s]", " ", txt.lower())
                return re.sub(r"\s+", " ", txt).strip()

            target_norm = normalise(sentence)
            best_para = ""
            best_ratio = 0.0

            # split on blank lines β†’ paragraphs
            for para in (p.strip() for p in content.split("\n\n") if p.strip()):
                para_norm = normalise(para)

                # 1) quick exact-substring on normalised text
                if target_norm in para_norm:
                    context = para
                    break

                # 2) otherwise keep best fuzzy match
                ratio = SequenceMatcher(None, target_norm, para_norm).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_para = para

            # accept fuzzy hit if fairly close
            if not context and best_ratio >= 0.55:
                context = best_para

        else:
            print(f"❌ Markdown file not found: {md_path}")
    else:
        print(f"πŸ” No sentence provided for context lookup")

    print(f"πŸ” Final context length: {len(context)} characters")
    payload = {**work, "context": context}
    return jsonify(payload)


@app.route("/topics", methods=["GET"])
def get_topics():
    if STUB_MODE:
        return jsonify({
            "C52119013": "Art History",
            "T13922": "Historical Art and Culture Studies",
            "T12632": "Visual Culture and Art Theory"
        })
    return jsonify(topic_names)


@app.route("/creators", methods=["GET"])
def get_creators():
    if STUB_MODE:
        return jsonify({
            "arthur_hughes": ["W4206160935", "W2029124454"],
            "francesco_hayez": ["W1982215463", "W4388661114"],
            "george_stubbs": ["W2020798572", "W2021094421"]
        })
    return jsonify(creators)


@app.route("/models", methods=["GET"])
def get_models():
    """
    Return the list of models.
    """
    return jsonify(["CLIP", "PaintingCLIP"])


@app.route("/cell-sim", methods=["GET"])
def cell_similarity():
    if STUB_MODE:
        # Return stub results that match the expected frontend structure
        return jsonify({
            "sentences": [
                {
                    "sentence_id": f"W123_s{i:04d}",
                    "english_original": f"Stub cell sentence {i} for testing.",
                    "work": "W123",
                    "score": 0.9 - i*0.01,
                    "rank": i
                }
                for i in range(1, 6)
            ]
        })
    
    try:
        run_id = request.args["runId"]
        row = int(request.args["row"])
        col = int(request.args["col"])
        k = int(request.args.get("k", 25))

        # Get the run info to retrieve filtering parameters
        run_info = RUNS.get(run_id, {})
        topics = run_info.get("topics", [])
        creators = run_info.get("creators", [])
        model = run_info.get("model", "paintingclip").lower()  # Convert to lowercase

        img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
        if not img_path.exists():
            return jsonify({"error": "Image not found"}), 404
            
        results = inference.run_inference(
            str(img_path),
            cell=(row, col),
            top_k=k,
            filter_topics=topics,
            filter_creators=creators,
            model_type=model,
        )
        return jsonify(results)
    except Exception as e:
        print(f"❌ Error in cell_similarity: {e}")
        return jsonify({"error": str(e)}), 500


# --------------------------------------------------------------------------- #
#  Accurate Grad-ECLIP heat-map                                              #
# --------------------------------------------------------------------------- #
@app.route("/heatmap", methods=["POST"])
def heatmap():
    """
    Body:
        {
            "runId":   "...",
            "sentence": "Full English Original text …",
            "layerIdx": -1          # optional, defaults to last block
        }

    Response:
        { "dataUrl": "data:image/png;base64,..." }
    """
    if STUB_MODE:
        # Return a stub heatmap for Phase 1
        return jsonify({"dataUrl": ""})
    
    payload = request.get_json(force=True)
    run_id = payload["runId"]
    sentence = payload["sentence"]
    layer = int(payload.get("layerIdx", -1))

    # Truncate sentence if it's too long for CLIP (max 77 tokens)
    MAX_SENTENCE_LENGTH = 300
    if len(sentence) > MAX_SENTENCE_LENGTH:
        sentence = sentence[: MAX_SENTENCE_LENGTH - 3] + "..."

    # Path of the already-uploaded artefact
    img_path = ARTIFACTS_DIR / f"{run_id}.jpg"
    if not img_path.exists():
        return jsonify({"error": "image-not-found"}), 404

    try:
        data_url = compute_heatmap(str(img_path), sentence, layer_idx=layer)
        return jsonify({"dataUrl": data_url})
    except Exception as exc:
        print(f"Heatmap generation error: {exc}")
        return jsonify({"error": str(exc)}), 500


# --------------------------------------------------------------------------- #
#  NEW:  marker-output image helpers                                          #
# --------------------------------------------------------------------------- #
@app.route("/marker/<work_id>/<path:filename>", methods=["GET"])
def serve_marker_image(work_id: str, filename: str):
    """
    Static file server for data/marker_output/<work_id>/images/<filename>
    Falls back to work_id/<filename> if images directory doesn't exist
    """
    work_dir = get_markdown_dir() / work_id
    img_dir = work_dir / "images"
    img_path = img_dir / filename
    
    # Try images directory first
    if img_path.exists():
        mime, _ = guess_type(filename)
        return send_from_directory(img_dir, filename, mimetype=mime)
    
    # Fallback: try work directory directly
    work_img_path = work_dir / filename
    if work_img_path.exists():
        mime, _ = guess_type(filename)
        return send_from_directory(work_dir, filename, mimetype=mime)
    
    return jsonify({"error": "not-found"}), 404


# --------------------------------------------------------------------------- #
#  NEW:  paper file helpers                                                  #
# --------------------------------------------------------------------------- #
@app.route("/paper/<path:filename>")
def serve_paper(filename: str):
    """Serve paper files."""
    try:
        paper_dir = Path(__file__).parent.parent.parent / "frontend" / "paper"
        if not paper_dir.exists():
            return "Paper directory not found", 404
        
        file_path = paper_dir / filename
        if not file_path.exists():
            return f"Paper file {filename} not found", 404
            
        return send_from_directory(paper_dir, filename)
    except Exception as e:
        print(f"Error serving paper file {filename}: {e}")
        return "Internal server error", 500


# --------------------------------------------------------------------------- #
#  Error Handlers                                                             #
# --------------------------------------------------------------------------- #
@app.errorhandler(413)  # Payload too large
def too_large(e):
    """Answer HTTP 413 errors with a JSON error body."""
    body = {"error": "File too large for HF Spaces"}
    return jsonify(body), 413

@app.errorhandler(500)
def internal_error(e):
    """Answer HTTP 500 errors with a generic JSON error body."""
    body = {"error": "Internal server error"}
    return jsonify(body), 500

# --------------------------------------------------------------------------- #
#  Markdown Cache Management Endpoints                                        #
# --------------------------------------------------------------------------- #

@app.route("/cache/info", methods=["GET"])
def cache_info():
    """Get information about the markdown cache"""
    try:
        info = get_markdown_cache_info()
        return jsonify(info)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route("/cache/clear", methods=["POST"])
def cache_clear():
    """Clear the markdown cache to force re-download"""
    try:
        success = clear_markdown_cache()
        if success:
            return jsonify({"message": "Cache cleared successfully"})
        else:
            return jsonify({"error": "Failed to clear cache"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route("/cache/refresh", methods=["POST"])
def cache_refresh():
    """Force refresh the markdown dataset"""
    try:
        # Clear cache and force re-download
        clear_markdown_cache()
        markdown_dir = get_markdown_dir(force_refresh=True)
        
        if markdown_dir and markdown_dir.exists():
            cache_info = get_markdown_cache_info()
            return jsonify({
                "message": "Cache refresh initiated successfully",
                "cache_info": cache_info
            })
        else:
            return jsonify({"error": "Failed to refresh cache"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route("/cache/optimized-download", methods=["POST"])
def cache_optimized_download():
    """Start optimized markdown dataset download with parallel processing"""
    try:
        from .config import _download_markdown_optimized
        
        # Clear cache first
        clear_markdown_cache()
        
        # Get the works directory
        markdown_cache_dir = WRITE_ROOT / "markdown_cache"
        works_dir = markdown_cache_dir / "works"
        
        # Start optimized download
        print("πŸš€ Starting optimized markdown download...")
        result = _download_markdown_optimized(works_dir)
        
        if result and result.exists():
            cache_info = get_markdown_cache_info()
            return jsonify({
                "message": "Optimized download completed successfully",
                "cache_info": cache_info
            })
        else:
            return jsonify({"error": "Optimized download failed"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# --------------------------------------------------------------------------- #
if __name__ == "__main__":  # invoked via  python -m …
    # Use PORT environment variable for Hugging Face Spaces
    port = int(os.getenv("PORT", 7860))  # Default to 7860 for HF Spaces
    app.run(host="0.0.0.0", port=port, debug=False)