Spaces:
Build error
Build error
Claude committed on
feat: Sprint 4 — indexed search + job rate limiting
Browse files
Search:
- New PageSearchIndex model (page_search table)
- New search/indexer.py service with index_page(), search_pages(), reindex_all()
- Refactored /search endpoint to query DB instead of filesystem scan
- Added POST /search/reindex endpoint for full rebuild
- Search index populated on pipeline completion (job_runner) and corrections (pages)
- 21 new search index unit tests + 15 updated API search tests
Rate limiting:
- Guard against duplicate pipeline runs on /corpora/{id}/run and /pages/{id}/run
- Returns HTTP 409 if active jobs (pending/claimed/running) already exist
- 4 new guard tests
https://claude.ai/code/session_012NCh8yLxMXkRmBYQgHCTik
- backend/app/api/v1/jobs.py +29 -0
- backend/app/api/v1/pages.py +5 -0
- backend/app/api/v1/search.py +23 -113
- backend/app/models/__init__.py +2 -0
- backend/app/models/page_search.py +24 -0
- backend/app/services/job_runner.py +4 -0
- backend/app/services/search/__init__.py +1 -0
- backend/app/services/search/indexer.py +149 -0
- backend/tests/test_api_jobs.py +82 -5
- backend/tests/test_api_search.py +80 -163
- backend/tests/test_search_index.py +463 -0
backend/app/api/v1/jobs.py
CHANGED
|
@@ -27,6 +27,7 @@ router = APIRouter(tags=["jobs"])
|
|
| 27 |
|
| 28 |
_JOB_STATUS_PENDING = "pending"
|
| 29 |
_JOB_STATUS_FAILED = "failed"
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
# ── Schémas de réponse ────────────────────────────────────────────────────────
|
|
@@ -83,6 +84,20 @@ async def run_corpus(
|
|
| 83 |
if corpus is None:
|
| 84 |
raise HTTPException(status_code=404, detail="Corpus introuvable")
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
ms_result = await db.execute(
|
| 87 |
select(ManuscriptModel).where(ManuscriptModel.corpus_id == corpus_id)
|
| 88 |
)
|
|
@@ -129,6 +144,20 @@ async def run_page(
|
|
| 129 |
if manuscript is None:
|
| 130 |
raise HTTPException(status_code=404, detail="Manuscrit introuvable")
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
job = _new_job(manuscript.corpus_id, page_id)
|
| 133 |
db.add(job)
|
| 134 |
await db.commit()
|
|
|
|
| 27 |
|
| 28 |
_JOB_STATUS_PENDING = "pending"
|
| 29 |
_JOB_STATUS_FAILED = "failed"
|
| 30 |
+
_ACTIVE_STATUSES = ("pending", "claimed", "running")
|
| 31 |
|
| 32 |
|
| 33 |
# ── Schémas de réponse ────────────────────────────────────────────────────────
|
|
|
|
| 84 |
if corpus is None:
|
| 85 |
raise HTTPException(status_code=404, detail="Corpus introuvable")
|
| 86 |
|
| 87 |
+
# ── Guard : rejeter si des jobs sont déjà actifs pour ce corpus ──────
|
| 88 |
+
active_result = await db.execute(
|
| 89 |
+
select(JobModel).where(
|
| 90 |
+
JobModel.corpus_id == corpus_id,
|
| 91 |
+
JobModel.status.in_(_ACTIVE_STATUSES),
|
| 92 |
+
).limit(1)
|
| 93 |
+
)
|
| 94 |
+
if active_result.scalar_one_or_none() is not None:
|
| 95 |
+
raise HTTPException(
|
| 96 |
+
status_code=409,
|
| 97 |
+
detail="Le pipeline est déjà en cours pour ce corpus. "
|
| 98 |
+
"Attendez la fin des jobs actifs ou relancez les jobs échoués.",
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
ms_result = await db.execute(
|
| 102 |
select(ManuscriptModel).where(ManuscriptModel.corpus_id == corpus_id)
|
| 103 |
)
|
|
|
|
| 144 |
if manuscript is None:
|
| 145 |
raise HTTPException(status_code=404, detail="Manuscrit introuvable")
|
| 146 |
|
| 147 |
+
# ── Guard : rejeter si un job est déjà actif pour cette page ─────────
|
| 148 |
+
active_result = await db.execute(
|
| 149 |
+
select(JobModel).where(
|
| 150 |
+
JobModel.page_id == page_id,
|
| 151 |
+
JobModel.status.in_(_ACTIVE_STATUSES),
|
| 152 |
+
).limit(1)
|
| 153 |
+
)
|
| 154 |
+
if active_result.scalar_one_or_none() is not None:
|
| 155 |
+
raise HTTPException(
|
| 156 |
+
status_code=409,
|
| 157 |
+
detail="Le pipeline est déjà en cours pour cette page. "
|
| 158 |
+
"Attendez la fin du job actif ou relancez-le s'il a échoué.",
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
job = _new_job(manuscript.corpus_id, page_id)
|
| 162 |
db.add(job)
|
| 163 |
await db.commit()
|
backend/app/api/v1/pages.py
CHANGED
|
@@ -371,6 +371,11 @@ async def apply_corrections(
|
|
| 371 |
status_code=500,
|
| 372 |
detail=f"Impossible d'écrire master.json : {exc}",
|
| 373 |
) from exc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
logger.info(
|
| 375 |
"Corrections appliquées",
|
| 376 |
extra={"page_id": page_id, "version": new_master.editorial.version},
|
|
|
|
| 371 |
status_code=500,
|
| 372 |
detail=f"Impossible d'écrire master.json : {exc}",
|
| 373 |
) from exc
|
| 374 |
+
|
| 375 |
+
# ── Mise à jour de l'index de recherche ──────────────────────────────
|
| 376 |
+
from app.services.search.indexer import index_page
|
| 377 |
+
await index_page(db, new_master)
|
| 378 |
+
|
| 379 |
logger.info(
|
| 380 |
"Corrections appliquées",
|
| 381 |
extra={"page_id": page_id, "version": new_master.editorial.version},
|
backend/app/api/v1/search.py
CHANGED
|
@@ -1,32 +1,27 @@
|
|
| 1 |
"""
|
| 2 |
Endpoint de recherche plein texte (R10 — préfixe /api/v1/).
|
| 3 |
|
| 4 |
-
GET
|
|
|
|
| 5 |
|
| 6 |
-
Implémentation
|
| 7 |
-
|
| 8 |
"""
|
| 9 |
-
# 1. stdlib
|
| 10 |
-
import asyncio
|
| 11 |
-
import json
|
| 12 |
import logging
|
| 13 |
-
import unicodedata
|
| 14 |
-
from pathlib import Path
|
| 15 |
|
| 16 |
-
|
| 17 |
-
from fastapi import APIRouter, Query
|
| 18 |
from pydantic import BaseModel
|
|
|
|
| 19 |
|
| 20 |
-
# 3. local
|
| 21 |
from app import config as _config_module
|
|
|
|
|
|
|
| 22 |
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
router = APIRouter(tags=["search"])
|
| 26 |
|
| 27 |
|
| 28 |
-
# ── Schémas ───────────────────────────────────────────────────────────────────
|
| 29 |
-
|
| 30 |
class SearchResult(BaseModel):
|
| 31 |
page_id: str
|
| 32 |
folio_label: str
|
|
@@ -36,109 +31,24 @@ class SearchResult(BaseModel):
|
|
| 36 |
corpus_profile: str
|
| 37 |
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def _normalize(text: str) -> str:
|
| 42 |
-
"""Minuscules + suppression des accents (NFD → ASCII)."""
|
| 43 |
-
nfd = unicodedata.normalize("NFD", text.lower())
|
| 44 |
-
return nfd.encode("ascii", "ignore").decode("ascii")
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def _excerpt(text: str, query_normalized: str, context: int = 120) -> str:
|
| 48 |
-
"""Extrait un contexte autour de la première occurrence de la requête."""
|
| 49 |
-
text_n = _normalize(text)
|
| 50 |
-
idx = text_n.find(query_normalized)
|
| 51 |
-
if idx == -1:
|
| 52 |
-
return text[: context * 2]
|
| 53 |
-
start = max(0, idx - context // 2)
|
| 54 |
-
end = min(len(text), idx + len(query_normalized) + context // 2)
|
| 55 |
-
result = text[start:end]
|
| 56 |
-
if start > 0:
|
| 57 |
-
result = "…" + result
|
| 58 |
-
if end < len(text):
|
| 59 |
-
result = result + "…"
|
| 60 |
-
return result
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
|
| 64 |
-
"""Retourne (nombre d'occurrences, premier extrait) pour un master.json."""
|
| 65 |
-
texts: list[str] = []
|
| 66 |
-
|
| 67 |
-
if data.get("ocr") and data["ocr"].get("diplomatic_text"):
|
| 68 |
-
texts.append(data["ocr"]["diplomatic_text"])
|
| 69 |
-
|
| 70 |
-
if data.get("translation") and data["translation"].get("fr"):
|
| 71 |
-
texts.append(data["translation"]["fr"])
|
| 72 |
|
| 73 |
-
# Extensions : champs iconography[].tags (profils qui les exposent)
|
| 74 |
-
extensions = data.get("extensions") or {}
|
| 75 |
-
icono = extensions.get("iconography") or []
|
| 76 |
-
if isinstance(icono, list):
|
| 77 |
-
for item in icono:
|
| 78 |
-
if isinstance(item, dict):
|
| 79 |
-
tags = item.get("tags") or []
|
| 80 |
-
if isinstance(tags, list):
|
| 81 |
-
texts.extend(str(t) for t in tags)
|
| 82 |
-
|
| 83 |
-
count = 0
|
| 84 |
-
first_excerpt = ""
|
| 85 |
-
for text in texts:
|
| 86 |
-
n = _normalize(text)
|
| 87 |
-
hits = n.count(query_normalized)
|
| 88 |
-
count += hits
|
| 89 |
-
if hits > 0 and not first_excerpt:
|
| 90 |
-
first_excerpt = _excerpt(text, query_normalized)
|
| 91 |
-
|
| 92 |
-
return count, first_excerpt
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
# ── Endpoint ──────────────────────────────────────────────────────────────────
|
| 96 |
|
| 97 |
@router.get("/search", response_model=list[SearchResult])
|
| 98 |
-
async def
|
| 99 |
-
q: str = Query(..., min_length=2, max_length=500
|
| 100 |
-
limit: int = Query(200, ge=1, le=2000
|
|
|
|
| 101 |
) -> list[SearchResult]:
|
| 102 |
-
"""Recherche plein texte dans
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
Insensible à la casse et aux accents.
|
| 107 |
-
"""
|
| 108 |
-
query_normalized = _normalize(q.strip())
|
| 109 |
-
data_dir = _config_module.settings.data_dir
|
| 110 |
-
|
| 111 |
-
def _scan() -> list[SearchResult]:
|
| 112 |
-
"""Scan bloquant exécuté dans un thread dédié."""
|
| 113 |
-
hits: list[SearchResult] = []
|
| 114 |
-
for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
|
| 115 |
-
try:
|
| 116 |
-
raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
|
| 117 |
-
except (json.JSONDecodeError, OSError):
|
| 118 |
-
continue
|
| 119 |
-
|
| 120 |
-
# Vérification minimale de la structure attendue
|
| 121 |
-
if not isinstance(raw.get("page_id"), str):
|
| 122 |
-
logger.warning("master.json invalide ignoré : %s", master_path)
|
| 123 |
-
continue
|
| 124 |
-
|
| 125 |
-
score, excerpt = _score_master(raw, query_normalized)
|
| 126 |
-
if score == 0:
|
| 127 |
-
continue
|
| 128 |
|
| 129 |
-
hits.append(
|
| 130 |
-
SearchResult(
|
| 131 |
-
page_id=raw.get("page_id", ""),
|
| 132 |
-
folio_label=raw.get("folio_label", ""),
|
| 133 |
-
manuscript_id=raw.get("manuscript_id", ""),
|
| 134 |
-
excerpt=excerpt,
|
| 135 |
-
score=score,
|
| 136 |
-
corpus_profile=raw.get("corpus_profile", ""),
|
| 137 |
-
)
|
| 138 |
-
)
|
| 139 |
-
hits.sort(key=lambda r: r.score, reverse=True)
|
| 140 |
-
return hits
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
Endpoint de recherche plein texte (R10 — préfixe /api/v1/).
|
| 3 |
|
| 4 |
+
GET /api/v1/search?q={query}
|
| 5 |
+
POST /api/v1/search/reindex
|
| 6 |
|
| 7 |
+
Implémentation indexée : les données sont dans la table page_search,
|
| 8 |
+
mises à jour à chaque écriture de master.json.
|
| 9 |
"""
|
|
|
|
|
|
|
|
|
|
| 10 |
import logging
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
from fastapi import APIRouter, Depends, Query
|
|
|
|
| 13 |
from pydantic import BaseModel
|
| 14 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 15 |
|
|
|
|
| 16 |
from app import config as _config_module
|
| 17 |
+
from app.models.database import get_db
|
| 18 |
+
from app.services.search.indexer import reindex_all, search_pages
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
| 22 |
router = APIRouter(tags=["search"])
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
| 25 |
class SearchResult(BaseModel):
|
| 26 |
page_id: str
|
| 27 |
folio_label: str
|
|
|
|
| 31 |
corpus_profile: str
|
| 32 |
|
| 33 |
|
| 34 |
+
class ReindexResponse(BaseModel):
    # Response body of POST /search/reindex.
    # Documented with comments rather than a docstring so the generated
    # JSON schema for this model is left unchanged.

    # Value returned by reindex_all(): how many pages were indexed.
    pages_indexed: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
@router.get("/search", response_model=list[SearchResult])
async def search(
    q: str = Query(..., min_length=2, max_length=500),
    limit: int = Query(200, ge=1, le=2000),
    db: AsyncSession = Depends(get_db),
) -> list[SearchResult]:
    """Recherche plein texte dans l'index des pages analysées."""
    # NOTE: the docstring above is kept verbatim because FastAPI publishes a
    # handler's docstring as the operation description in the OpenAPI schema.
    #
    # q     -- search string, 2 to 500 characters (enforced by Query).
    # limit -- maximum number of results, 1 to 2000, default 200.
    # db    -- AsyncSession supplied by the get_db dependency.
    #
    # search_pages() returns plain dicts whose keys are the SearchResult
    # field names; each one is expanded into a response model.
    matches = await search_pages(db, q, limit)
    logger.info("Recherche exécutée", extra={"q": q, "results": len(matches)})
    return [SearchResult(**match) for match in matches]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
+
@router.post("/search/reindex", response_model=ReindexResponse)
async def reindex(db: AsyncSession = Depends(get_db)) -> ReindexResponse:
    """Reconstruit l'index de recherche depuis les fichiers master.json."""
    # NOTE: docstring kept verbatim; FastAPI exposes it in the OpenAPI schema.
    # Rebuild the index from the configured data directory and report the
    # number of pages reindex_all() says it indexed.
    data_dir = _config_module.settings.data_dir
    pages_indexed = await reindex_all(db, data_dir)
    return ReindexResponse(pages_indexed=pages_indexed)
|
backend/app/models/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ au moment de la création des tables (Base.metadata.create_all).
|
|
| 5 |
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
|
| 6 |
from app.models.job import JobModel
|
| 7 |
from app.models.model_config_db import ModelConfigDB
|
|
|
|
| 8 |
|
| 9 |
__all__ = [
|
| 10 |
"CorpusModel",
|
|
@@ -12,4 +13,5 @@ __all__ = [
|
|
| 12 |
"PageModel",
|
| 13 |
"JobModel",
|
| 14 |
"ModelConfigDB",
|
|
|
|
| 15 |
]
|
|
|
|
| 5 |
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
|
| 6 |
from app.models.job import JobModel
|
| 7 |
from app.models.model_config_db import ModelConfigDB
|
| 8 |
+
from app.models.page_search import PageSearchIndex
|
| 9 |
|
| 10 |
__all__ = [
|
| 11 |
"CorpusModel",
|
|
|
|
| 13 |
"PageModel",
|
| 14 |
"JobModel",
|
| 15 |
"ModelConfigDB",
|
| 16 |
+
"PageSearchIndex",
|
| 17 |
]
|
backend/app/models/page_search.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèle SQLAlchemy pour l'index de recherche plein texte (FTS5).
|
| 3 |
+
|
| 4 |
+
La table page_search_fts est une table virtuelle FTS5 créée via SQL brut.
|
| 5 |
+
Ce modèle représente les données indexées pour chaque page analysée.
|
| 6 |
+
"""
|
| 7 |
+
from sqlalchemy import String, Text
|
| 8 |
+
from sqlalchemy.orm import Mapped, mapped_column
|
| 9 |
+
|
| 10 |
+
from app.models.database import Base
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class PageSearchIndex(Base):
    """Search index row holding the searchable fields of one page.

    Stored in the ``page_search`` table and keyed by ``page_id``. Every
    other column is NOT NULL and defaults to the empty string.
    """

    __tablename__ = "page_search"

    # Primary key: one row per page.
    page_id: Mapped[str] = mapped_column(String, primary_key=True)

    # Metadata copied alongside the text columns.
    corpus_profile: Mapped[str] = mapped_column(
        String, nullable=False, default=""
    )
    manuscript_id: Mapped[str] = mapped_column(
        String, nullable=False, default=""
    )
    folio_label: Mapped[str] = mapped_column(
        String, nullable=False, default=""
    )

    # Free-text columns.
    diplomatic_text: Mapped[str] = mapped_column(
        Text, nullable=False, default=""
    )
    translation_fr: Mapped[str] = mapped_column(
        Text, nullable=False, default=""
    )
    tags: Mapped[str] = mapped_column(Text, nullable=False, default="")
|
backend/app/services/job_runner.py
CHANGED
|
@@ -227,6 +227,10 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
|
|
| 227 |
"(ni iiif_service_url, ni image_master_path)"
|
| 228 |
)
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
|
| 231 |
from app.services.export.alto import generate_alto, write_alto
|
| 232 |
|
|
|
|
| 227 |
"(ni iiif_service_url, ni image_master_path)"
|
| 228 |
)
|
| 229 |
|
| 230 |
+
# ── 6b. Index pour la recherche ─────────────────────────────────────
|
| 231 |
+
from app.services.search.indexer import index_page
|
| 232 |
+
await index_page(db, page_master)
|
| 233 |
+
|
| 234 |
# ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
|
| 235 |
from app.services.export.alto import generate_alto, write_alto
|
| 236 |
|
backend/app/services/search/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Service de recherche indexée."""
|
backend/app/services/search/indexer.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Indexing and search service for analysed pages (page_search table; matching is done in Python on normalized text, not with SQLite FTS5).
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
import unicodedata
|
| 6 |
+
|
| 7 |
+
from sqlalchemy import text
|
| 8 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 9 |
+
|
| 10 |
+
from app.models.page_search import PageSearchIndex
|
| 11 |
+
from app.schemas.page_master import PageMaster
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _normalize(txt: str) -> str:
|
| 17 |
+
"""Minuscules + suppression des accents (NFD -> ASCII)."""
|
| 18 |
+
nfd = unicodedata.normalize("NFD", txt.lower())
|
| 19 |
+
return nfd.encode("ascii", "ignore").decode("ascii")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _extract_tags(master: PageMaster) -> str:
    """Flatten the iconography tags of *master* into one string.

    Reads ``master.extensions["iconography"]``, which is expected to be a
    list of dicts that may each carry a ``"tags"`` entry. Values that do
    not have that shape are skipped rather than raising, so a malformed
    extension cannot make indexing fail.

    Returns:
        The tags joined by single spaces, or ``""`` when there are none.
    """
    extensions = master.extensions or {}
    icono = extensions.get("iconography") or []
    if not isinstance(icono, list):
        return ""

    collected: list[str] = []
    for item in icono:
        if not isinstance(item, dict):
            continue
        item_tags = item.get("tags") or []
        if isinstance(item_tags, str):
            # A bare string is one tag; iterating over it would index it
            # letter by letter ("d r a g o n").
            collected.append(item_tags)
        elif isinstance(item_tags, (list, tuple)):
            collected.extend(str(tag) for tag in item_tags)
        # Any other type (number, mapping, ...) is not a tag list: ignore it
        # instead of failing on a non-iterable value.
    return " ".join(collected)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
async def index_page(db: AsyncSession, master: PageMaster) -> None:
    """Insert or refresh the search-index row for *master*.

    The row is looked up by ``master.page_id``. An existing row has all of
    its indexed columns overwritten; otherwise a new row is added. The
    session is flushed but not committed here.

    Args:
        db: Session used for the lookup, the write and the flush.
        master: Page whose OCR text, French translation and iconography
            tags are indexed. A missing ``ocr``/``translation`` section, or
            a falsy text inside it, is stored as an empty string.
    """
    row = await db.get(PageSearchIndex, master.page_id)

    ocr = master.ocr
    translation = master.translation
    diplomatic_text = (ocr.diplomatic_text if ocr else "") or ""
    translation_fr = (translation.fr if translation else "") or ""
    tags = _extract_tags(master)

    columns = {
        "corpus_profile": master.corpus_profile,
        "manuscript_id": master.manuscript_id,
        "folio_label": master.folio_label,
        "diplomatic_text": diplomatic_text,
        "translation_fr": translation_fr,
        "tags": tags,
    }

    if row:
        for column, value in columns.items():
            setattr(row, column, value)
    else:
        db.add(PageSearchIndex(page_id=master.page_id, **columns))

    await db.flush()
    logger.debug("Page indexee", extra={"page_id": master.page_id})
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _excerpt_around_match(raw: str, query_norm: str, context: int = 60) -> str:
    """Return the part of *raw* surrounding the first match of *query_norm*.

    ``_normalize`` can change the length of a string (characters without an
    ASCII form are dropped, combining marks disappear), so an offset found
    in the normalized text is not a valid offset into *raw*. This helper
    therefore records, for every normalized character, the index of the raw
    character it came from, and slices *raw* with those indices.

    Args:
        raw: Original, un-normalized field text.
        query_norm: Normalized, non-empty query.
        context: Number of raw characters kept on each side of the match.

    Returns:
        The excerpt, with an ellipsis on each side that was cut.
    """
    if raw.isascii():
        # Lower-casing pure ASCII never changes length: offsets are identical.
        normalized = raw.lower()
        origin = range(len(raw))
    else:
        norm_chars: list[str] = []
        origin = []
        for pos, char in enumerate(raw):
            for piece in _normalize(char):
                norm_chars.append(piece)
                origin.append(pos)
        normalized = "".join(norm_chars)

    match_at = normalized.find(query_norm)
    if match_at == -1:
        # Not expected when the caller already counted a match; return the
        # head of the text rather than raising.
        return raw[: context * 2]

    match_start = origin[match_at]
    match_end = origin[match_at + len(query_norm) - 1] + 1
    start = max(0, match_start - context)
    end = min(len(raw), match_end + context)

    excerpt = raw[start:end]
    if start > 0:
        excerpt = "\u2026" + excerpt
    if end < len(raw):
        excerpt = excerpt + "\u2026"
    return excerpt


async def search_pages(db: AsyncSession, query: str, limit: int = 200) -> list[dict]:
    """Search the page index, ignoring case and accents.

    Every row of ``page_search`` is loaded and matched in Python: the query
    and each text column are passed through ``_normalize`` and occurrences
    are counted with ``str.count``. The cost is therefore linear in the
    number of indexed pages; SQLite FTS5 is not used.

    Args:
        db: Session used to read the ``page_search`` table.
        query: Raw user query; it is stripped and normalized before matching.
        limit: Maximum number of hits returned, applied after sorting.

    Returns:
        A list of dicts with the keys ``page_id``, ``folio_label``,
        ``manuscript_id``, ``excerpt``, ``score`` and ``corpus_profile``,
        sorted by descending score (total number of occurrences across the
        diplomatic text, the French translation and the tags). The excerpt
        comes from the first of those fields that contains a match. An
        empty list is returned when the normalized query is empty.
    """
    query_norm = _normalize(query.strip())
    if not query_norm:
        return []

    # No WHERE clause: accent-insensitive matching is done in Python below.
    result = await db.execute(
        text("""
            SELECT page_id, corpus_profile, manuscript_id, folio_label,
                   diplomatic_text, translation_fr, tags
            FROM page_search
        """)
    )

    hits: list[dict] = []
    for row in result.fetchall():
        page_id, corpus_profile, manuscript_id, folio_label, diplo, trans, tags = row

        score = 0
        excerpt = ""
        for field_text in (diplo, trans, tags):
            if not field_text:
                continue
            count = _normalize(field_text).count(query_norm)
            if count == 0:
                continue
            score += count
            if not excerpt:
                # Only the first matching field provides the excerpt.
                excerpt = _excerpt_around_match(field_text, query_norm)

        if score > 0:
            hits.append({
                "page_id": page_id,
                "folio_label": folio_label,
                "manuscript_id": manuscript_id,
                "excerpt": excerpt,
                "score": score,
                "corpus_profile": corpus_profile,
            })

    hits.sort(key=lambda h: h["score"], reverse=True)
    return hits[:limit]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
async def reindex_all(db: AsyncSession, data_dir) -> int:
    """Rebuild the search index from the master.json files under *data_dir*.

    Every file matching ``corpora/*/pages/*/master.json`` is parsed,
    validated as a ``PageMaster`` and passed to ``index_page``. A file whose
    ``page_id`` is not a string is skipped silently; any other failure on a
    file is logged as a warning and the scan moves on. The session is
    committed once, after the scan.

    Args:
        db: Session used for indexing and for the final commit.
        data_dir: Root data directory (anything ``pathlib.Path`` accepts).

    Returns:
        The number of pages that were indexed without error.
    """
    import json
    from pathlib import Path

    indexed = 0
    root = Path(data_dir)
    for path in root.glob("corpora/*/pages/*/master.json"):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            if not isinstance(payload.get("page_id"), str):
                continue
            await index_page(db, PageMaster.model_validate(payload))
        except Exception as exc:
            # Best effort: one bad file must not abort the whole rebuild.
            logger.warning("Reindexation echouee pour %s: %s", path, exc)
        else:
            indexed += 1

    await db.commit()
    logger.info("Reindexation terminee", extra={"pages_indexed": indexed})
    return indexed
|
backend/tests/test_api_jobs.py
CHANGED
|
@@ -75,6 +75,22 @@ async def _make_failed_job(db, corpus_id, page_id=None):
|
|
| 75 |
return job
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# ---------------------------------------------------------------------------
|
| 79 |
# POST /api/v1/corpora/{id}/run
|
| 80 |
# ---------------------------------------------------------------------------
|
|
@@ -183,15 +199,17 @@ async def test_run_page_job_id_is_uuid(async_client, db_session):
|
|
| 183 |
|
| 184 |
|
| 185 |
@pytest.mark.asyncio
|
| 186 |
-
async def
|
| 187 |
-
"""Lancer run sur la même page deux fois
|
| 188 |
corpus = await _make_corpus(db_session)
|
| 189 |
ms = await _make_manuscript(db_session, corpus.id)
|
| 190 |
page = await _make_page(db_session, ms.id)
|
| 191 |
|
| 192 |
-
r1 =
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
| 195 |
|
| 196 |
|
| 197 |
# ---------------------------------------------------------------------------
|
|
@@ -287,3 +305,62 @@ async def test_retry_failed_job_is_retrievable(async_client, db_session):
|
|
| 287 |
await async_client.post(f"/api/v1/jobs/{job.id}/retry")
|
| 288 |
data = (await async_client.get(f"/api/v1/jobs/{job.id}")).json()
|
| 289 |
assert data["status"] == "pending"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return job
|
| 76 |
|
| 77 |
|
| 78 |
+
async def _make_job(db, corpus_id, page_id=None, status="pending"):
    """Insert a job with an arbitrary status and return the refreshed row.

    ``error_message`` is ``"err"`` for the ``"failed"`` status and ``None``
    for every other status.
    """
    if status == "failed":
        error_message = "err"
    else:
        error_message = None

    record = JobModel(
        id=str(uuid.uuid4()),
        corpus_id=corpus_id,
        page_id=page_id,
        status=status,
        error_message=error_message,
        created_at=_NOW,
    )
    db.add(record)
    await db.commit()
    await db.refresh(record)
    return record
|
| 92 |
+
|
| 93 |
+
|
| 94 |
# ---------------------------------------------------------------------------
|
| 95 |
# POST /api/v1/corpora/{id}/run
|
| 96 |
# ---------------------------------------------------------------------------
|
|
|
|
| 199 |
|
| 200 |
|
| 201 |
@pytest.mark.asyncio
async def test_run_page_duplicate_blocked(async_client, db_session):
    """Running the same page twice yields 409 on the second attempt."""
    corpus = await _make_corpus(db_session)
    manuscript = await _make_manuscript(db_session, corpus.id)
    page = await _make_page(db_session, manuscript.id)
    run_url = f"/api/v1/pages/{page.id}/run"

    first = await async_client.post(run_url)
    assert first.status_code == 202

    # The job created by the first call is still active, so the guard fires.
    second = await async_client.post(run_url)
    assert second.status_code == 409
|
| 213 |
|
| 214 |
|
| 215 |
# ---------------------------------------------------------------------------
|
|
|
|
| 305 |
await async_client.post(f"/api/v1/jobs/{job.id}/retry")
|
| 306 |
data = (await async_client.get(f"/api/v1/jobs/{job.id}")).json()
|
| 307 |
assert data["status"] == "pending"
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
# ---------------------------------------------------------------------------
|
| 311 |
+
# Rate-limiting guards — duplicate pipeline runs
|
| 312 |
+
# ---------------------------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
@pytest.mark.asyncio
async def test_run_corpus_rejects_if_active_jobs(async_client, db_session):
    """409 when the corpus already has an active job (a pending one here)."""
    corpus = await _make_corpus(db_session, slug="guard-c1")
    manuscript = await _make_manuscript(db_session, corpus.id)
    await _make_page(db_session, manuscript.id)
    # Seed a pending job straight into the database.
    await _make_job(db_session, corpus.id, status="pending")

    run_url = f"/api/v1/corpora/{corpus.id}/run"
    resp = await async_client.post(run_url)

    assert resp.status_code == 409
    assert "déjà en cours" in resp.json()["detail"]
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
@pytest.mark.asyncio
async def test_run_page_rejects_if_active_job(async_client, db_session):
    """409 when the page already has a running job."""
    corpus = await _make_corpus(db_session, slug="guard-p1")
    manuscript = await _make_manuscript(db_session, corpus.id)
    page = await _make_page(db_session, manuscript.id)
    # Seed a running job straight into the database.
    await _make_job(db_session, corpus.id, page_id=page.id, status="running")

    run_url = f"/api/v1/pages/{page.id}/run"
    resp = await async_client.post(run_url)

    assert resp.status_code == 409
    assert "déjà en cours" in resp.json()["detail"]
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
@pytest.mark.asyncio
async def test_run_corpus_allows_after_all_done(async_client, db_session):
    """202 when every existing job of the corpus is finished (done)."""
    corpus = await _make_corpus(db_session, slug="guard-c2")
    manuscript = await _make_manuscript(db_session, corpus.id)
    page = await _make_page(db_session, manuscript.id)
    # Seed two finished jobs for the same page.
    for _ in range(2):
        await _make_job(db_session, corpus.id, page_id=page.id, status="done")

    run_url = f"/api/v1/corpora/{corpus.id}/run"
    resp = await async_client.post(run_url)

    assert resp.status_code == 202
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
@pytest.mark.asyncio
async def test_run_page_allows_after_failed(async_client, db_session):
    """202 when the only existing job for the page is failed."""
    corpus = await _make_corpus(db_session, slug="guard-p2")
    manuscript = await _make_manuscript(db_session, corpus.id)
    page = await _make_page(db_session, manuscript.id)
    # Seed a failed job: it is not an active status, so a new run is allowed.
    await _make_job(db_session, corpus.id, page_id=page.id, status="failed")

    run_url = f"/api/v1/pages/{page.id}/run"
    resp = await async_client.post(run_url)

    assert resp.status_code == 202
|
backend/tests/test_api_search.py
CHANGED
|
@@ -1,60 +1,49 @@
|
|
| 1 |
"""
|
| 2 |
-
Tests de l'endpoint GET /api/v1/search (Sprint
|
| 3 |
|
| 4 |
Stratégie :
|
| 5 |
-
-
|
| 6 |
-
- Override de settings.data_dir pour pointer sur tmp_path
|
| 7 |
- Vérifie : 422 (paramètre manquant / trop court), résultats vides,
|
| 8 |
correspondance OCR, insensibilité casse et accents, tri par score,
|
| 9 |
extrait (excerpt) présent.
|
| 10 |
"""
|
| 11 |
# 1. stdlib
|
| 12 |
-
import json
|
| 13 |
import uuid
|
| 14 |
-
from datetime import datetime, timezone
|
| 15 |
-
from pathlib import Path
|
| 16 |
|
| 17 |
# 2. third-party
|
| 18 |
import pytest
|
| 19 |
|
| 20 |
# 3. local
|
|
|
|
| 21 |
from tests.conftest_api import async_client, db_session # noqa: F401
|
| 22 |
|
| 23 |
-
_NOW = datetime.now(timezone.utc)
|
| 24 |
-
|
| 25 |
|
| 26 |
# ── Helpers ────────────────────────────────────────────────────────────────────
|
| 27 |
|
| 28 |
-
def
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def _write_master(tmp_path: Path, corpus_slug: str, page_id: str, data: dict) -> None:
|
| 55 |
-
page_dir = tmp_path / "corpora" / corpus_slug / "pages" / page_id
|
| 56 |
-
page_dir.mkdir(parents=True)
|
| 57 |
-
(page_dir / "master.json").write_text(json.dumps(data), encoding="utf-8")
|
| 58 |
|
| 59 |
|
| 60 |
# ── Tests ──────────────────────────────────────────────────────────────────────
|
|
@@ -74,50 +63,27 @@ async def test_search_q_too_short(async_client):
|
|
| 74 |
|
| 75 |
|
| 76 |
@pytest.mark.asyncio
|
| 77 |
-
async def test_search_empty_results(async_client
|
| 78 |
-
"""Retourne [] quand
|
| 79 |
-
|
| 80 |
-
original = config_mod.settings.data_dir
|
| 81 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 82 |
-
try:
|
| 83 |
-
resp = await async_client.get("/api/v1/search?q=rien")
|
| 84 |
-
finally:
|
| 85 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 86 |
-
|
| 87 |
assert resp.status_code == 200
|
| 88 |
assert resp.json() == []
|
| 89 |
|
| 90 |
|
| 91 |
@pytest.mark.asyncio
|
| 92 |
-
async def test_search_returns_list(async_client
|
| 93 |
"""Le type de retour est toujours une liste."""
|
| 94 |
-
|
| 95 |
-
original = config_mod.settings.data_dir
|
| 96 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 97 |
-
try:
|
| 98 |
-
resp = await async_client.get("/api/v1/search?q=texte")
|
| 99 |
-
finally:
|
| 100 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 101 |
-
|
| 102 |
assert resp.status_code == 200
|
| 103 |
assert isinstance(resp.json(), list)
|
| 104 |
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
-
async def test_search_finds_ocr_text(async_client,
|
| 108 |
-
"""Trouve
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
page_id = str(uuid.uuid4())
|
| 112 |
-
_write_master(tmp_path, "corpus-a", page_id, _make_master(page_id, diplomatic_text="Incipit liber primus"))
|
| 113 |
-
|
| 114 |
-
original = config_mod.settings.data_dir
|
| 115 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 116 |
-
try:
|
| 117 |
-
resp = await async_client.get("/api/v1/search?q=Incipit")
|
| 118 |
-
finally:
|
| 119 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 120 |
|
|
|
|
| 121 |
assert resp.status_code == 200
|
| 122 |
results = resp.json()
|
| 123 |
assert len(results) == 1
|
|
@@ -125,20 +91,11 @@ async def test_search_finds_ocr_text(async_client, tmp_path):
|
|
| 125 |
|
| 126 |
|
| 127 |
@pytest.mark.asyncio
|
| 128 |
-
async def test_search_case_insensitive(async_client,
|
| 129 |
"""La recherche est insensible à la casse."""
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
page_id = str(uuid.uuid4())
|
| 133 |
-
_write_master(tmp_path, "corpus-b", page_id, _make_master(page_id, diplomatic_text="INCIPIT LIBER"))
|
| 134 |
-
|
| 135 |
-
original = config_mod.settings.data_dir
|
| 136 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 137 |
-
try:
|
| 138 |
-
resp = await async_client.get("/api/v1/search?q=incipit")
|
| 139 |
-
finally:
|
| 140 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 141 |
|
|
|
|
| 142 |
assert resp.status_code == 200
|
| 143 |
results = resp.json()
|
| 144 |
assert len(results) >= 1
|
|
@@ -146,20 +103,13 @@ async def test_search_case_insensitive(async_client, tmp_path):
|
|
| 146 |
|
| 147 |
|
| 148 |
@pytest.mark.asyncio
|
| 149 |
-
async def test_search_accent_insensitive(async_client,
|
| 150 |
"""La recherche est insensible aux accents."""
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
_write_master(tmp_path, "corpus-c", page_id, _make_master(page_id, diplomatic_text="Édition française médiévale"))
|
| 155 |
-
|
| 156 |
-
original = config_mod.settings.data_dir
|
| 157 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 158 |
-
try:
|
| 159 |
-
resp = await async_client.get("/api/v1/search?q=edition")
|
| 160 |
-
finally:
|
| 161 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 162 |
|
|
|
|
| 163 |
assert resp.status_code == 200
|
| 164 |
results = resp.json()
|
| 165 |
assert len(results) >= 1
|
|
@@ -167,59 +117,34 @@ async def test_search_accent_insensitive(async_client, tmp_path):
|
|
| 167 |
|
| 168 |
|
| 169 |
@pytest.mark.asyncio
|
| 170 |
-
async def test_search_finds_translation_fr(async_client,
|
| 171 |
-
"""Trouve également dans
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
_write_master(tmp_path, "corpus-d", page_id, _make_master(page_id, translation_fr="Ici commence le premier livre"))
|
| 176 |
-
|
| 177 |
-
original = config_mod.settings.data_dir
|
| 178 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 179 |
-
try:
|
| 180 |
-
resp = await async_client.get("/api/v1/search?q=premier")
|
| 181 |
-
finally:
|
| 182 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 183 |
|
|
|
|
| 184 |
assert resp.status_code == 200
|
| 185 |
results = resp.json()
|
| 186 |
assert any(r["page_id"] == page_id for r in results)
|
| 187 |
|
| 188 |
|
| 189 |
@pytest.mark.asyncio
|
| 190 |
-
async def test_search_no_match_returns_empty(async_client,
|
| 191 |
"""Ne retourne rien quand la requête ne correspond à aucun texte."""
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
page_id = str(uuid.uuid4())
|
| 195 |
-
_write_master(tmp_path, "corpus-e", page_id, _make_master(page_id, diplomatic_text="Incipit liber"))
|
| 196 |
-
|
| 197 |
-
original = config_mod.settings.data_dir
|
| 198 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 199 |
-
try:
|
| 200 |
-
resp = await async_client.get("/api/v1/search?q=xyznomatch")
|
| 201 |
-
finally:
|
| 202 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 203 |
|
|
|
|
| 204 |
assert resp.status_code == 200
|
| 205 |
assert resp.json() == []
|
| 206 |
|
| 207 |
|
| 208 |
@pytest.mark.asyncio
|
| 209 |
-
async def test_search_result_has_excerpt(async_client,
|
| 210 |
"""Chaque résultat contient un champ excerpt non vide."""
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
page_id = str(uuid.uuid4())
|
| 214 |
-
_write_master(tmp_path, "corpus-f", page_id, _make_master(page_id, diplomatic_text="Incipit liber primus"))
|
| 215 |
-
|
| 216 |
-
original = config_mod.settings.data_dir
|
| 217 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 218 |
-
try:
|
| 219 |
-
resp = await async_client.get("/api/v1/search?q=liber")
|
| 220 |
-
finally:
|
| 221 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 222 |
|
|
|
|
| 223 |
assert resp.status_code == 200
|
| 224 |
results = resp.json()
|
| 225 |
assert len(results) >= 1
|
|
@@ -227,27 +152,16 @@ async def test_search_result_has_excerpt(async_client, tmp_path):
|
|
| 227 |
|
| 228 |
|
| 229 |
@pytest.mark.asyncio
|
| 230 |
-
async def test_search_sorted_by_score_desc(async_client,
|
| 231 |
"""Les résultats sont triés par score décroissant."""
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
page_id_2 =
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
)
|
| 240 |
-
_write_master(tmp_path, "corpus-g", page_id_2, _make_master(
|
| 241 |
-
page_id_2, diplomatic_text="liber unus"
|
| 242 |
-
))
|
| 243 |
-
|
| 244 |
-
original = config_mod.settings.data_dir
|
| 245 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 246 |
-
try:
|
| 247 |
-
resp = await async_client.get("/api/v1/search?q=liber")
|
| 248 |
-
finally:
|
| 249 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 250 |
-
|
| 251 |
assert resp.status_code == 200
|
| 252 |
results = resp.json()
|
| 253 |
assert len(results) == 2
|
|
@@ -256,20 +170,11 @@ async def test_search_sorted_by_score_desc(async_client, tmp_path):
|
|
| 256 |
|
| 257 |
|
| 258 |
@pytest.mark.asyncio
|
| 259 |
-
async def test_search_result_fields(async_client,
|
| 260 |
"""Chaque résultat expose les champs attendus."""
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
page_id = str(uuid.uuid4())
|
| 264 |
-
_write_master(tmp_path, "corpus-h", page_id, _make_master(page_id, diplomatic_text="Incipit liber"))
|
| 265 |
-
|
| 266 |
-
original = config_mod.settings.data_dir
|
| 267 |
-
config_mod.settings.__dict__["data_dir"] = tmp_path
|
| 268 |
-
try:
|
| 269 |
-
resp = await async_client.get("/api/v1/search?q=Incipit")
|
| 270 |
-
finally:
|
| 271 |
-
config_mod.settings.__dict__["data_dir"] = original
|
| 272 |
|
|
|
|
| 273 |
assert resp.status_code == 200
|
| 274 |
result = resp.json()[0]
|
| 275 |
assert "page_id" in result
|
|
@@ -278,3 +183,15 @@ async def test_search_result_fields(async_client, tmp_path):
|
|
| 278 |
assert "excerpt" in result
|
| 279 |
assert "score" in result
|
| 280 |
assert "corpus_profile" in result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Tests de l'endpoint GET /api/v1/search (Sprint 4 — recherche indexée).
|
| 3 |
|
| 4 |
Stratégie :
|
| 5 |
+
- Données indexées directement dans la table page_search (BDD en mémoire)
|
|
|
|
| 6 |
- Vérifie : 422 (paramètre manquant / trop court), résultats vides,
|
| 7 |
correspondance OCR, insensibilité casse et accents, tri par score,
|
| 8 |
extrait (excerpt) présent.
|
| 9 |
"""
|
| 10 |
# 1. stdlib
|
|
|
|
| 11 |
import uuid
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# 2. third-party
|
| 14 |
import pytest
|
| 15 |
|
| 16 |
# 3. local
|
| 17 |
+
from app.models.page_search import PageSearchIndex
|
| 18 |
from tests.conftest_api import async_client, db_session # noqa: F401
|
| 19 |
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# ── Helpers ────────────────────────────────────────────────────────────────────
|
| 22 |
|
| 23 |
+
async def _index_page(
    db,
    page_id: str | None = None,
    diplomatic_text: str = "",
    translation_fr: str = "",
    tags: str = "",
    corpus_profile: str = "medieval-illuminated",
    manuscript_id: str = "ms-test",
    folio_label: str = "f001r",
) -> str:
    """Insert one row into page_search, commit, and return its page_id.

    When ``page_id`` is falsy, a random UUID4 string is generated and used.
    """
    if not page_id:
        page_id = str(uuid.uuid4())

    db.add(
        PageSearchIndex(
            page_id=page_id,
            corpus_profile=corpus_profile,
            manuscript_id=manuscript_id,
            folio_label=folio_label,
            diplomatic_text=diplomatic_text,
            translation_fr=translation_fr,
            tags=tags,
        )
    )
    await db.commit()
    return page_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
# ── Tests ──────────────────────────────────────────────────────────────────────
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
@pytest.mark.asyncio
async def test_search_empty_results(async_client):
    """The endpoint answers 200 with an empty list when nothing is indexed."""
    response = await async_client.get("/api/v1/search?q=rien")

    assert response.status_code == 200
    payload = response.json()
    assert payload == []
|
| 71 |
|
| 72 |
|
| 73 |
@pytest.mark.asyncio
async def test_search_returns_list(async_client):
    """The response body is always a JSON list."""
    response = await async_client.get("/api/v1/search?q=texte")

    assert response.status_code == 200
    payload = response.json()
    assert isinstance(payload, list)
|
| 79 |
|
| 80 |
|
| 81 |
@pytest.mark.asyncio
|
| 82 |
+
async def test_search_finds_ocr_text(async_client, db_session):
|
| 83 |
+
"""Trouve une page dont diplomatic_text contient la requête."""
|
| 84 |
+
page_id = await _index_page(db_session, diplomatic_text="Incipit liber primus")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
resp = await async_client.get("/api/v1/search?q=Incipit")
|
| 87 |
assert resp.status_code == 200
|
| 88 |
results = resp.json()
|
| 89 |
assert len(results) == 1
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
@pytest.mark.asyncio
|
| 94 |
+
async def test_search_case_insensitive(async_client, db_session):
|
| 95 |
"""La recherche est insensible à la casse."""
|
| 96 |
+
page_id = await _index_page(db_session, diplomatic_text="INCIPIT LIBER")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
resp = await async_client.get("/api/v1/search?q=incipit")
|
| 99 |
assert resp.status_code == 200
|
| 100 |
results = resp.json()
|
| 101 |
assert len(results) >= 1
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
@pytest.mark.asyncio
|
| 106 |
+
async def test_search_accent_insensitive(async_client, db_session):
|
| 107 |
"""La recherche est insensible aux accents."""
|
| 108 |
+
page_id = await _index_page(
|
| 109 |
+
db_session, diplomatic_text="Édition française médiévale"
|
| 110 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
resp = await async_client.get("/api/v1/search?q=edition")
|
| 113 |
assert resp.status_code == 200
|
| 114 |
results = resp.json()
|
| 115 |
assert len(results) >= 1
|
|
|
|
| 117 |
|
| 118 |
|
| 119 |
@pytest.mark.asyncio
async def test_search_finds_translation_fr(async_client, db_session):
    """A query matching only translation_fr still returns the page."""
    indexed_id = await _index_page(
        db_session,
        translation_fr="Ici commence le premier livre",
    )

    response = await async_client.get("/api/v1/search?q=premier")

    assert response.status_code == 200
    returned_ids = [hit["page_id"] for hit in response.json()]
    assert any(pid == indexed_id for pid in returned_ids)
|
| 130 |
|
| 131 |
|
| 132 |
@pytest.mark.asyncio
async def test_search_no_match_returns_empty(async_client, db_session):
    """A query that matches no indexed text yields an empty list."""
    await _index_page(db_session, diplomatic_text="Incipit liber")

    response = await async_client.get("/api/v1/search?q=xyznomatch")

    assert response.status_code == 200
    payload = response.json()
    assert payload == []
|
| 140 |
|
| 141 |
|
| 142 |
@pytest.mark.asyncio
|
| 143 |
+
async def test_search_result_has_excerpt(async_client, db_session):
|
| 144 |
"""Chaque résultat contient un champ excerpt non vide."""
|
| 145 |
+
await _index_page(db_session, diplomatic_text="Incipit liber primus")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
resp = await async_client.get("/api/v1/search?q=liber")
|
| 148 |
assert resp.status_code == 200
|
| 149 |
results = resp.json()
|
| 150 |
assert len(results) >= 1
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
@pytest.mark.asyncio
|
| 155 |
+
async def test_search_sorted_by_score_desc(async_client, db_session):
|
| 156 |
"""Les résultats sont triés par score décroissant."""
|
| 157 |
+
page_id_1 = await _index_page(
|
| 158 |
+
db_session, diplomatic_text="liber liber liber"
|
| 159 |
+
)
|
| 160 |
+
page_id_2 = await _index_page(
|
| 161 |
+
db_session, diplomatic_text="liber unus"
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
resp = await async_client.get("/api/v1/search?q=liber")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
assert resp.status_code == 200
|
| 166 |
results = resp.json()
|
| 167 |
assert len(results) == 2
|
|
|
|
| 170 |
|
| 171 |
|
| 172 |
@pytest.mark.asyncio
|
| 173 |
+
async def test_search_result_fields(async_client, db_session):
|
| 174 |
"""Chaque résultat expose les champs attendus."""
|
| 175 |
+
await _index_page(db_session, diplomatic_text="Incipit liber")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
+
resp = await async_client.get("/api/v1/search?q=Incipit")
|
| 178 |
assert resp.status_code == 200
|
| 179 |
result = resp.json()[0]
|
| 180 |
assert "page_id" in result
|
|
|
|
| 183 |
assert "excerpt" in result
|
| 184 |
assert "score" in result
|
| 185 |
assert "corpus_profile" in result
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@pytest.mark.asyncio
async def test_search_finds_tags(async_client, db_session):
    """A query matching only the tags column returns the page."""
    indexed_id = await _index_page(db_session, tags="apocalypse sceau martyrs")

    response = await async_client.get("/api/v1/search?q=apocalypse")

    assert response.status_code == 200
    hits = response.json()
    assert len(hits) >= 1
    assert any(hit["page_id"] == indexed_id for hit in hits)
|
backend/tests/test_search_index.py
ADDED
|
@@ -0,0 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests pour le service d'indexation et de recherche (page_search + indexer).
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from unittest.mock import patch
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
import pytest_asyncio
|
| 10 |
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
| 11 |
+
|
| 12 |
+
import app.models # noqa: F401 — enregistrement des modeles
|
| 13 |
+
from app.models.database import Base
|
| 14 |
+
from app.schemas.page_master import PageMaster
|
| 15 |
+
from app.services.search.indexer import (
|
| 16 |
+
_extract_tags,
|
| 17 |
+
_normalize,
|
| 18 |
+
index_page,
|
| 19 |
+
reindex_all,
|
| 20 |
+
search_pages,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
| 25 |
+
|
| 26 |
+
# SQLAlchemy URL of the in-memory SQLite database (aiosqlite driver) used by the db fixture.
_TEST_DB_URL = "sqlite+aiosqlite:///:memory:"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@pytest_asyncio.fixture
async def db():
    """Yield an AsyncSession backed by a fresh in-memory SQLite database.

    All tables registered on ``Base.metadata`` are created before the session
    is handed out, then dropped and the engine disposed afterwards.
    """
    test_engine = create_async_engine(_TEST_DB_URL, echo=False)

    # Setup: create the schema.
    async with test_engine.begin() as setup_conn:
        await setup_conn.run_sync(Base.metadata.create_all)

    session_factory = async_sessionmaker(test_engine, expire_on_commit=False)
    async with session_factory() as test_session:
        yield test_session

    # Teardown: drop the schema and release the engine.
    async with test_engine.begin() as teardown_conn:
        await teardown_conn.run_sync(Base.metadata.drop_all)
    await test_engine.dispose()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _make_master(
    page_id: str = "test-ms-001r",
    corpus_profile: str = "medieval-illuminated",
    manuscript_id: str = "test-ms",
    folio_label: str = "001r",
    diplomatic_text: str = "Explicit liber primus",
    translation_fr: str = "Fin du premier livre",
    tags: list[str] | None = None,
) -> PageMaster:
    """Build a minimal PageMaster for the tests.

    When ``tags`` is non-empty, it is stored under
    ``extensions["iconography"]`` as a single entry attached to region ``r1``;
    otherwise ``extensions`` is an empty dict.
    """
    image = {
        "master": "https://example.com/image.jpg",
        "width": 3000,
        "height": 4000,
    }
    region = {
        "id": "r1",
        "type": "text_block",
        "bbox": [100, 100, 500, 500],
        "confidence": 0.9,
    }
    ocr = {
        "diplomatic_text": diplomatic_text,
        "language": "la",
        "confidence": 0.8,
    }
    extensions: dict = (
        {"iconography": [{"region_id": "r1", "tags": tags}]} if tags else {}
    )

    return PageMaster.model_validate(
        {
            "schema_version": "1.0",
            "page_id": page_id,
            "corpus_profile": corpus_profile,
            "manuscript_id": manuscript_id,
            "folio_label": folio_label,
            "sequence": 1,
            "image": image,
            "layout": {"regions": [region]},
            "ocr": ocr,
            "translation": {"fr": translation_fr, "en": ""},
            "extensions": extensions,
        }
    )
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ── Tests _normalize ──────────────────────────────────────────────────────────
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class TestNormalize:
    """Checks on the _normalize text helper."""

    def test_lowercase(self):
        """Upper-case input comes back lower-cased."""
        normalized = _normalize("HELLO")
        assert normalized == "hello"

    def test_accent_removal(self):
        """Accented letters are replaced by their unaccented form."""
        normalized = _normalize("éàü")
        assert normalized == "eau"

    def test_combined(self):
        """Case folding and accent removal apply together."""
        normalized = _normalize("Début du Récit")
        assert normalized == "debut du recit"

    def test_empty(self):
        """The empty string is returned unchanged."""
        normalized = _normalize("")
        assert normalized == ""
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Tests _extract_tags ───────────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class TestExtractTags:
    """Checks on the _extract_tags helper."""

    def test_with_tags(self):
        """Every iconography tag appears in the extracted value."""
        extracted = _extract_tags(
            _make_master(tags=["apocalypse", "martyrs", "autel"])
        )
        for expected_tag in ("apocalypse", "martyrs", "autel"):
            assert expected_tag in extracted

    def test_no_tags(self):
        """A master built without tags extracts to the empty string."""
        extracted = _extract_tags(_make_master(tags=None))
        assert extracted == ""

    def test_empty_extensions(self):
        """An explicitly empty extensions dict extracts to the empty string."""
        # Round-trip through a JSON dump so extensions can be forced to {}.
        payload = _make_master().model_dump(mode="json")
        payload["extensions"] = {}
        rebuilt = PageMaster.model_validate(payload)
        assert _extract_tags(rebuilt) == ""
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ── Tests index_page ─────────────────────────────────────────────────────────
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class TestIndexPage:
    """Checks that index_page writes the expected PageSearchIndex rows."""

    @pytest.mark.asyncio
    async def test_index_new_page(self, db: AsyncSession):
        """Indexing a new master creates a row carrying its text fields."""
        from app.models.page_search import PageSearchIndex

        page = _make_master()
        await index_page(db, page)
        await db.commit()

        # The row must be retrievable by primary key after the commit.
        stored = await db.get(PageSearchIndex, page.page_id)
        assert stored is not None
        assert stored.page_id == "test-ms-001r"
        assert stored.diplomatic_text == "Explicit liber primus"
        assert stored.translation_fr == "Fin du premier livre"
        assert stored.manuscript_id == "test-ms"

    @pytest.mark.asyncio
    async def test_index_update_existing(self, db: AsyncSession):
        """Indexing the same page_id twice keeps the latest content."""
        from app.models.page_search import PageSearchIndex

        first = _make_master(diplomatic_text="version 1")
        await index_page(db, first)
        await db.commit()

        # Same default page_id, new text.
        second = _make_master(diplomatic_text="version 2")
        await index_page(db, second)
        await db.commit()

        stored = await db.get(PageSearchIndex, first.page_id)
        assert stored is not None
        assert stored.diplomatic_text == "version 2"

    @pytest.mark.asyncio
    async def test_index_page_without_ocr(self, db: AsyncSession):
        """A master with no OCR and no translation indexes empty strings."""
        from app.models.page_search import PageSearchIndex

        image = {
            "master": "https://example.com/image.jpg",
            "width": 3000,
            "height": 4000,
        }
        page = PageMaster.model_validate(
            {
                "schema_version": "1.0",
                "page_id": "no-ocr-page",
                "corpus_profile": "medieval-illuminated",
                "manuscript_id": "test-ms",
                "folio_label": "001r",
                "sequence": 1,
                "image": image,
                "layout": {"regions": []},
                "ocr": None,
                "translation": None,
            }
        )
        await index_page(db, page)
        await db.commit()

        stored = await db.get(PageSearchIndex, "no-ocr-page")
        assert stored is not None
        assert stored.diplomatic_text == ""
        assert stored.translation_fr == ""

    @pytest.mark.asyncio
    async def test_index_page_with_tags(self, db: AsyncSession):
        """Iconography tags end up in the tags column of the row."""
        from app.models.page_search import PageSearchIndex

        page = _make_master(tags=["sceau", "martyrs"])
        await index_page(db, page)
        await db.commit()

        stored = await db.get(PageSearchIndex, page.page_id)
        assert stored is not None
        for expected_tag in ("sceau", "martyrs"):
            assert expected_tag in stored.tags
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ── Tests search_pages ────────────────────────────────────────────────────────
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class TestSearchPages:
    """Checks on search_pages against rows written through index_page."""

    @pytest.mark.asyncio
    async def test_search_finds_diplomatic_text(self, db: AsyncSession):
        """A word from diplomatic_text yields one hit with a score of at least 1."""
        page = _make_master(diplomatic_text="Explicit liber primus incipit")
        await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "liber")
        assert len(results) == 1
        top = results[0]
        assert top["page_id"] == "test-ms-001r"
        assert top["score"] >= 1

    @pytest.mark.asyncio
    async def test_search_finds_translation(self, db: AsyncSession):
        """A word from the French translation yields the page."""
        page = _make_master(translation_fr="Fin du premier livre")
        await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "premier")
        assert len(results) == 1
        assert results[0]["page_id"] == "test-ms-001r"

    @pytest.mark.asyncio
    async def test_search_finds_tags(self, db: AsyncSession):
        """A query equal to an iconography tag yields the page."""
        page = _make_master(tags=["apocalypse", "martyrs"])
        await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "apocalypse")
        assert len(results) == 1

    @pytest.mark.asyncio
    async def test_accent_insensitive_search(self, db: AsyncSession):
        """Queries match whether or not they carry the accents of the text."""
        page = _make_master(translation_fr="Début du récit apocalyptique")
        await index_page(db, page)
        await db.commit()

        # In order: unaccented form, accented form, unaccented form of another word.
        for query in ("debut", "début", "recit"):
            results = await search_pages(db, query)
            assert len(results) == 1

    @pytest.mark.asyncio
    async def test_case_insensitive_search(self, db: AsyncSession):
        """Upper- and lower-case queries both match mixed-case text."""
        page = _make_master(diplomatic_text="Explicit Liber Primus")
        await index_page(db, page)
        await db.commit()

        for query in ("EXPLICIT", "explicit"):
            results = await search_pages(db, query)
            assert len(results) == 1

    @pytest.mark.asyncio
    async def test_empty_query_returns_nothing(self, db: AsyncSession):
        """Empty and whitespace-only queries return an empty list."""
        page = _make_master()
        await index_page(db, page)
        await db.commit()

        for query in ("", " "):
            results = await search_pages(db, query)
            assert results == []

    @pytest.mark.asyncio
    async def test_no_match_returns_empty(self, db: AsyncSession):
        """A query absent from every indexed field returns an empty list."""
        page = _make_master(diplomatic_text="Explicit liber primus")
        await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "zzzznonexistent")
        assert results == []

    @pytest.mark.asyncio
    async def test_results_sorted_by_score(self, db: AsyncSession):
        """The page with more occurrences of the query is ranked first."""
        many_occurrences = _make_master(
            page_id="ms-high",
            folio_label="001r",
            diplomatic_text="liber liber liber liber liber",
        )
        few_occurrences = _make_master(
            page_id="ms-low",
            folio_label="002r",
            diplomatic_text="liber primus",
        )
        await index_page(db, many_occurrences)
        await index_page(db, few_occurrences)
        await db.commit()

        results = await search_pages(db, "liber")
        assert len(results) == 2
        best, runner_up = results[0], results[1]
        assert best["page_id"] == "ms-high"
        assert best["score"] > runner_up["score"]

    @pytest.mark.asyncio
    async def test_limit_parameter(self, db: AsyncSession):
        """``limit`` caps the number of hits returned."""
        # Five pages that all contain the query word.
        for seq in range(5):
            page = _make_master(
                page_id=f"ms-{seq:03d}r",
                folio_label=f"{seq:03d}r",
                diplomatic_text="common text shared across all pages",
            )
            await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "common", limit=3)
        assert len(results) == 3

    @pytest.mark.asyncio
    async def test_excerpt_is_populated(self, db: AsyncSession):
        """The excerpt of a hit contains the query word."""
        page = _make_master(
            diplomatic_text="Before context Explicit liber primus after context"
        )
        await index_page(db, page)
        await db.commit()

        results = await search_pages(db, "liber")
        assert len(results) == 1
        excerpt = results[0]["excerpt"]
        assert "liber" in excerpt.lower()

    @pytest.mark.asyncio
    async def test_search_across_multiple_fields(self, db: AsyncSession):
        """A page matching in several fields outranks a single-field match."""
        # Query word present in both diplomatic_text and translation_fr.
        multi_field = _make_master(
            page_id="ms-multi",
            diplomatic_text="liber primus",
            translation_fr="liber premier",
        )
        # Query word present in diplomatic_text only.
        single_field = _make_master(
            page_id="ms-single",
            diplomatic_text="liber primus",
            translation_fr="rien a voir",
        )
        await index_page(db, multi_field)
        await index_page(db, single_field)
        await db.commit()

        results = await search_pages(db, "liber")
        assert len(results) == 2
        best, runner_up = results[0], results[1]
        assert best["page_id"] == "ms-multi"
        assert best["score"] > runner_up["score"]
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ── Tests reindex_all ─────────────────────────────────────────────────────────
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
class TestReindexAll:
    """Exercise reindex_all against master.json files created under tmp_path."""

    @staticmethod
    def _write_master_file(data_dir: Path, page_dirname: str, content: str) -> None:
        """Write *content* to corpora/test-ms/pages/<page_dirname>/master.json."""
        target_dir = data_dir / "corpora" / "test-ms" / "pages" / page_dirname
        target_dir.mkdir(parents=True)
        (target_dir / "master.json").write_text(content, encoding="utf-8")

    @pytest.mark.asyncio
    async def test_reindex_from_filesystem(self, db: AsyncSession, tmp_path: Path):
        """reindex_all should read master.json files and populate the index."""
        payload = {
            "schema_version": "1.0",
            "page_id": "test-ms-001r",
            "corpus_profile": "medieval-illuminated",
            "manuscript_id": "test-ms",
            "folio_label": "001r",
            "sequence": 1,
            "image": {
                "master": "https://example.com/image.jpg",
                "width": 3000,
                "height": 4000,
            },
            "layout": {"regions": []},
            "ocr": {
                "diplomatic_text": "Explicit liber primus",
                "language": "la",
                "confidence": 0.8,
            },
            "translation": {"fr": "Fin du premier livre", "en": ""},
        }
        # Lay out a single-page fake corpus on disk.
        self._write_master_file(tmp_path, "001r", json.dumps(payload))

        indexed = await reindex_all(db, tmp_path)
        assert indexed == 1

        # The page must now be retrievable through the search index.
        results = await search_pages(db, "liber")
        assert len(results) == 1
        assert results[0]["page_id"] == "test-ms-001r"

    @pytest.mark.asyncio
    async def test_reindex_skips_invalid_files(self, db: AsyncSession, tmp_path: Path):
        """reindex_all should skip invalid master.json files gracefully."""
        # The file content is deliberately not parseable as JSON.
        self._write_master_file(tmp_path, "bad", "not valid json")

        indexed = await reindex_all(db, tmp_path)
        assert indexed == 0

    @pytest.mark.asyncio
    async def test_reindex_empty_dir(self, db: AsyncSession, tmp_path: Path):
        """reindex_all on an empty data dir should return 0."""
        indexed = await reindex_all(db, tmp_path)
        assert indexed == 0

    @pytest.mark.asyncio
    async def test_reindex_multiple_pages(self, db: AsyncSession, tmp_path: Path):
        """reindex_all with multiple valid master.json files."""
        for folio in ("001r", "002r", "003r"):
            payload = {
                "schema_version": "1.0",
                "page_id": f"test-ms-{folio}",
                "corpus_profile": "medieval-illuminated",
                "manuscript_id": "test-ms",
                "folio_label": folio,
                # The numeric prefix of the folio label doubles as the sequence.
                "sequence": int(folio[:3]),
                "image": {
                    "master": "https://example.com/image.jpg",
                    "width": 3000,
                    "height": 4000,
                },
                "layout": {"regions": []},
                "ocr": {
                    "diplomatic_text": f"Text for folio {folio}",
                    "language": "la",
                    "confidence": 0.8,
                },
            }
            self._write_master_file(tmp_path, folio, json.dumps(payload))

        indexed = await reindex_all(db, tmp_path)
        assert indexed == 3
|