Claude commited on
Commit
0341500
·
unverified ·
1 Parent(s): f8e84a5

feat: Sprint 4 — indexed search + job rate limiting

Browse files

Search:
- New PageSearchIndex model (page_search table)
- New search/indexer.py service with index_page(), search_pages(), reindex_all()
- Refactored /search endpoint to query DB instead of filesystem scan
- Added POST /search/reindex endpoint for full rebuild
- Search index populated on pipeline completion (job_runner) and corrections (pages)
- 21 new search index unit tests + 15 updated API search tests

Rate limiting:
- Guard against duplicate pipeline runs on /corpora/{id}/run and /pages/{id}/run
- Returns HTTP 409 if active jobs (pending/claimed/running) already exist
- 4 new guard tests

https://claude.ai/code/session_012NCh8yLxMXkRmBYQgHCTik

backend/app/api/v1/jobs.py CHANGED
@@ -27,6 +27,7 @@ router = APIRouter(tags=["jobs"])
27
 
28
  _JOB_STATUS_PENDING = "pending"
29
  _JOB_STATUS_FAILED = "failed"
 
30
 
31
 
32
  # ── Schémas de réponse ────────────────────────────────────────────────────────
@@ -83,6 +84,20 @@ async def run_corpus(
83
  if corpus is None:
84
  raise HTTPException(status_code=404, detail="Corpus introuvable")
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  ms_result = await db.execute(
87
  select(ManuscriptModel).where(ManuscriptModel.corpus_id == corpus_id)
88
  )
@@ -129,6 +144,20 @@ async def run_page(
129
  if manuscript is None:
130
  raise HTTPException(status_code=404, detail="Manuscrit introuvable")
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  job = _new_job(manuscript.corpus_id, page_id)
133
  db.add(job)
134
  await db.commit()
 
27
 
28
  _JOB_STATUS_PENDING = "pending"
29
  _JOB_STATUS_FAILED = "failed"
30
+ _ACTIVE_STATUSES = ("pending", "claimed", "running")
31
 
32
 
33
  # ── Schémas de réponse ────────────────────────────────────────────────────────
 
84
  if corpus is None:
85
  raise HTTPException(status_code=404, detail="Corpus introuvable")
86
 
87
+ # ── Guard : rejeter si des jobs sont déjà actifs pour ce corpus ──────
88
+ active_result = await db.execute(
89
+ select(JobModel).where(
90
+ JobModel.corpus_id == corpus_id,
91
+ JobModel.status.in_(_ACTIVE_STATUSES),
92
+ ).limit(1)
93
+ )
94
+ if active_result.scalar_one_or_none() is not None:
95
+ raise HTTPException(
96
+ status_code=409,
97
+ detail="Le pipeline est déjà en cours pour ce corpus. "
98
+ "Attendez la fin des jobs actifs ou relancez les jobs échoués.",
99
+ )
100
+
101
  ms_result = await db.execute(
102
  select(ManuscriptModel).where(ManuscriptModel.corpus_id == corpus_id)
103
  )
 
144
  if manuscript is None:
145
  raise HTTPException(status_code=404, detail="Manuscrit introuvable")
146
 
147
+ # ── Guard : rejeter si un job est déjà actif pour cette page ─────────
148
+ active_result = await db.execute(
149
+ select(JobModel).where(
150
+ JobModel.page_id == page_id,
151
+ JobModel.status.in_(_ACTIVE_STATUSES),
152
+ ).limit(1)
153
+ )
154
+ if active_result.scalar_one_or_none() is not None:
155
+ raise HTTPException(
156
+ status_code=409,
157
+ detail="Le pipeline est déjà en cours pour cette page. "
158
+ "Attendez la fin du job actif ou relancez-le s'il a échoué.",
159
+ )
160
+
161
  job = _new_job(manuscript.corpus_id, page_id)
162
  db.add(job)
163
  await db.commit()
backend/app/api/v1/pages.py CHANGED
@@ -371,6 +371,11 @@ async def apply_corrections(
371
  status_code=500,
372
  detail=f"Impossible d'écrire master.json : {exc}",
373
  ) from exc
 
 
 
 
 
374
  logger.info(
375
  "Corrections appliquées",
376
  extra={"page_id": page_id, "version": new_master.editorial.version},
 
371
  status_code=500,
372
  detail=f"Impossible d'écrire master.json : {exc}",
373
  ) from exc
374
+
375
+ # ── Mise à jour de l'index de recherche ──────────────────────────────
376
+ from app.services.search.indexer import index_page
377
+ await index_page(db, new_master)
378
+
379
  logger.info(
380
  "Corrections appliquées",
381
  extra={"page_id": page_id, "version": new_master.editorial.version},
backend/app/api/v1/search.py CHANGED
@@ -1,32 +1,27 @@
1
  """
2
  Endpoint de recherche plein texte (R10 — préfixe /api/v1/).
3
 
4
- GET /api/v1/search?q={query}
 
5
 
6
- Implémentation MVP : scan des fichiers master.json (pas d'index externe).
7
- Insensible à la casse et aux accents (unicodedata NFD + ASCII).
8
  """
9
- # 1. stdlib
10
- import asyncio
11
- import json
12
  import logging
13
- import unicodedata
14
- from pathlib import Path
15
 
16
- # 2. third-party
17
- from fastapi import APIRouter, Query
18
  from pydantic import BaseModel
 
19
 
20
- # 3. local
21
  from app import config as _config_module
 
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
  router = APIRouter(tags=["search"])
26
 
27
 
28
- # ── Schémas ───────────────────────────────────────────────────────────────────
29
-
30
  class SearchResult(BaseModel):
31
  page_id: str
32
  folio_label: str
@@ -36,109 +31,24 @@ class SearchResult(BaseModel):
36
  corpus_profile: str
37
 
38
 
39
- # ── Helpers ───────────────────────────────────────────────────────────────────
40
-
41
- def _normalize(text: str) -> str:
42
- """Minuscules + suppression des accents (NFD → ASCII)."""
43
- nfd = unicodedata.normalize("NFD", text.lower())
44
- return nfd.encode("ascii", "ignore").decode("ascii")
45
-
46
-
47
- def _excerpt(text: str, query_normalized: str, context: int = 120) -> str:
48
- """Extrait un contexte autour de la première occurrence de la requête."""
49
- text_n = _normalize(text)
50
- idx = text_n.find(query_normalized)
51
- if idx == -1:
52
- return text[: context * 2]
53
- start = max(0, idx - context // 2)
54
- end = min(len(text), idx + len(query_normalized) + context // 2)
55
- result = text[start:end]
56
- if start > 0:
57
- result = "…" + result
58
- if end < len(text):
59
- result = result + "…"
60
- return result
61
-
62
-
63
- def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
64
- """Retourne (nombre d'occurrences, premier extrait) pour un master.json."""
65
- texts: list[str] = []
66
-
67
- if data.get("ocr") and data["ocr"].get("diplomatic_text"):
68
- texts.append(data["ocr"]["diplomatic_text"])
69
-
70
- if data.get("translation") and data["translation"].get("fr"):
71
- texts.append(data["translation"]["fr"])
72
 
73
- # Extensions : champs iconography[].tags (profils qui les exposent)
74
- extensions = data.get("extensions") or {}
75
- icono = extensions.get("iconography") or []
76
- if isinstance(icono, list):
77
- for item in icono:
78
- if isinstance(item, dict):
79
- tags = item.get("tags") or []
80
- if isinstance(tags, list):
81
- texts.extend(str(t) for t in tags)
82
-
83
- count = 0
84
- first_excerpt = ""
85
- for text in texts:
86
- n = _normalize(text)
87
- hits = n.count(query_normalized)
88
- count += hits
89
- if hits > 0 and not first_excerpt:
90
- first_excerpt = _excerpt(text, query_normalized)
91
-
92
- return count, first_excerpt
93
-
94
-
95
- # ── Endpoint ──────────────────────────────────────────────────────────────────
96
 
97
  @router.get("/search", response_model=list[SearchResult])
98
- async def search_pages(
99
- q: str = Query(..., min_length=2, max_length=500, description="Requête de recherche (2–500 caractères)"),
100
- limit: int = Query(200, ge=1, le=2000, description="Nombre maximum de résultats"),
 
101
  ) -> list[SearchResult]:
102
- """Recherche plein texte dans les master.json de tous les corpus.
103
-
104
- Cherche dans : ocr.diplomatic_text, translation.fr,
105
- extensions.iconography[].tags (si présent).
106
- Insensible à la casse et aux accents.
107
- """
108
- query_normalized = _normalize(q.strip())
109
- data_dir = _config_module.settings.data_dir
110
-
111
- def _scan() -> list[SearchResult]:
112
- """Scan bloquant exécuté dans un thread dédié."""
113
- hits: list[SearchResult] = []
114
- for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
115
- try:
116
- raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
117
- except (json.JSONDecodeError, OSError):
118
- continue
119
-
120
- # Vérification minimale de la structure attendue
121
- if not isinstance(raw.get("page_id"), str):
122
- logger.warning("master.json invalide ignoré : %s", master_path)
123
- continue
124
-
125
- score, excerpt = _score_master(raw, query_normalized)
126
- if score == 0:
127
- continue
128
 
129
- hits.append(
130
- SearchResult(
131
- page_id=raw.get("page_id", ""),
132
- folio_label=raw.get("folio_label", ""),
133
- manuscript_id=raw.get("manuscript_id", ""),
134
- excerpt=excerpt,
135
- score=score,
136
- corpus_profile=raw.get("corpus_profile", ""),
137
- )
138
- )
139
- hits.sort(key=lambda r: r.score, reverse=True)
140
- return hits
141
 
142
- results = await asyncio.to_thread(_scan)
143
- logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
144
- return results[:limit]
 
 
 
1
  """
2
  Endpoint de recherche plein texte (R10 — préfixe /api/v1/).
3
 
4
+ GET /api/v1/search?q={query}
5
+ POST /api/v1/search/reindex
6
 
7
+ Implémentation indexée : les données sont dans la table page_search,
8
+ mises à jour à chaque écriture de master.json.
9
  """
 
 
 
10
  import logging
 
 
11
 
12
+ from fastapi import APIRouter, Depends, Query
 
13
  from pydantic import BaseModel
14
+ from sqlalchemy.ext.asyncio import AsyncSession
15
 
 
16
  from app import config as _config_module
17
+ from app.models.database import get_db
18
+ from app.services.search.indexer import reindex_all, search_pages
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
  router = APIRouter(tags=["search"])
23
 
24
 
 
 
25
  class SearchResult(BaseModel):
26
  page_id: str
27
  folio_label: str
 
31
  corpus_profile: str
32
 
33
 
34
class ReindexResponse(BaseModel):
    """Response body for POST /api/v1/search/reindex."""

    # Number of master.json files successfully (re)indexed during the rebuild.
    pages_indexed: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
@router.get("/search", response_model=list[SearchResult])
async def search(
    q: str = Query(..., min_length=2, max_length=500),
    limit: int = Query(200, ge=1, le=2000),
    db: AsyncSession = Depends(get_db),
) -> list[SearchResult]:
    """Full-text search over the index of analysed pages.

    Delegates matching/scoring to the indexer service, logs the query,
    and wraps each raw hit dict in a SearchResult model.
    """
    matches = await db_search(db, q, limit)
    logger.info("Recherche exécutée", extra={"q": q, "results": len(matches)})
    return [SearchResult(**match) for match in matches]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
@router.post("/search/reindex", response_model=ReindexResponse)
async def reindex(db: AsyncSession = Depends(get_db)) -> ReindexResponse:
    """Rebuild the search index from the master.json files on disk."""
    data_dir = _config_module.settings.data_dir
    indexed = await reindex_all(db, data_dir)
    return ReindexResponse(pages_indexed=indexed)
backend/app/models/__init__.py CHANGED
@@ -5,6 +5,7 @@ au moment de la création des tables (Base.metadata.create_all).
5
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
6
  from app.models.job import JobModel
7
  from app.models.model_config_db import ModelConfigDB
 
8
 
9
  __all__ = [
10
  "CorpusModel",
@@ -12,4 +13,5 @@ __all__ = [
12
  "PageModel",
13
  "JobModel",
14
  "ModelConfigDB",
 
15
  ]
 
5
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
6
  from app.models.job import JobModel
7
  from app.models.model_config_db import ModelConfigDB
8
+ from app.models.page_search import PageSearchIndex
9
 
10
  __all__ = [
11
  "CorpusModel",
 
13
  "PageModel",
14
  "JobModel",
15
  "ModelConfigDB",
16
+ "PageSearchIndex",
17
  ]
backend/app/models/page_search.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modèle SQLAlchemy pour l'index de recherche plein texte (FTS5).
3
+
4
+ La table page_search_fts est une table virtuelle FTS5 créée via SQL brut.
5
+ Ce modèle représente les données indexées pour chaque page analysée.
6
+ """
7
+ from sqlalchemy import String, Text
8
+ from sqlalchemy.orm import Mapped, mapped_column
9
+
10
+ from app.models.database import Base
11
+
12
+
13
class PageSearchIndex(Base):
    """Search index — mirror table holding the indexable data for one page.

    One row per analysed page, keyed by page_id. Rows are written by the
    indexer service (index_page) and scanned whole-table by search_pages();
    all text matching happens in Python, not in SQL.
    """

    __tablename__ = "page_search"

    # Stable page identifier (same value as master.json's page_id).
    page_id: Mapped[str] = mapped_column(String, primary_key=True)
    # Corpus profile label, echoed back in search results.
    corpus_profile: Mapped[str] = mapped_column(String, nullable=False, default="")
    # Owning manuscript identifier, echoed back in search results.
    manuscript_id: Mapped[str] = mapped_column(String, nullable=False, default="")
    # Human-readable folio label (e.g. "f001r").
    folio_label: Mapped[str] = mapped_column(String, nullable=False, default="")
    # Full diplomatic OCR transcription — searchable text.
    diplomatic_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
    # French translation — searchable text.
    translation_fr: Mapped[str] = mapped_column(Text, nullable=False, default="")
    # Space-separated flattened iconography tags — searchable text.
    tags: Mapped[str] = mapped_column(Text, nullable=False, default="")
backend/app/services/job_runner.py CHANGED
@@ -227,6 +227,10 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
227
  "(ni iiif_service_url, ni image_master_path)"
228
  )
229
 
 
 
 
 
230
  # ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
231
  from app.services.export.alto import generate_alto, write_alto
232
 
 
227
  "(ni iiif_service_url, ni image_master_path)"
228
  )
229
 
230
+ # ── 6b. Index pour la recherche ─────────────────────────────────────
231
+ from app.services.search.indexer import index_page
232
+ await index_page(db, page_master)
233
+
234
  # ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
235
  from app.services.export.alto import generate_alto, write_alto
236
 
backend/app/services/search/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Service de recherche indexée."""
backend/app/services/search/indexer.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Service d'indexation et de recherche FTS5 pour les pages analysées.
3
+ """
4
+ import logging
5
+ import unicodedata
6
+
7
+ from sqlalchemy import text
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+
10
+ from app.models.page_search import PageSearchIndex
11
+ from app.schemas.page_master import PageMaster
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def _normalize(txt: str) -> str:
17
+ """Minuscules + suppression des accents (NFD -> ASCII)."""
18
+ nfd = unicodedata.normalize("NFD", txt.lower())
19
+ return nfd.encode("ascii", "ignore").decode("ascii")
20
+
21
+
22
+ def _extract_tags(master: PageMaster) -> str:
23
+ """Extrait les tags iconography en une chaine plate."""
24
+ extensions = master.extensions or {}
25
+ icono = extensions.get("iconography") or []
26
+ tags: list[str] = []
27
+ if isinstance(icono, list):
28
+ for item in icono:
29
+ if isinstance(item, dict):
30
+ for t in (item.get("tags") or []):
31
+ tags.append(str(t))
32
+ return " ".join(tags)
33
+
34
+
35
+ async def index_page(db: AsyncSession, master: PageMaster) -> None:
36
+ """Indexe ou met a jour une page dans la table de recherche."""
37
+ existing = await db.get(PageSearchIndex, master.page_id)
38
+
39
+ diplomatic = (master.ocr.diplomatic_text if master.ocr else "") or ""
40
+ translation = (master.translation.fr if master.translation else "") or ""
41
+ tags = _extract_tags(master)
42
+
43
+ if existing:
44
+ existing.corpus_profile = master.corpus_profile
45
+ existing.manuscript_id = master.manuscript_id
46
+ existing.folio_label = master.folio_label
47
+ existing.diplomatic_text = diplomatic
48
+ existing.translation_fr = translation
49
+ existing.tags = tags
50
+ else:
51
+ entry = PageSearchIndex(
52
+ page_id=master.page_id,
53
+ corpus_profile=master.corpus_profile,
54
+ manuscript_id=master.manuscript_id,
55
+ folio_label=master.folio_label,
56
+ diplomatic_text=diplomatic,
57
+ translation_fr=translation,
58
+ tags=tags,
59
+ )
60
+ db.add(entry)
61
+
62
+ await db.flush()
63
+ logger.debug("Page indexee", extra={"page_id": master.page_id})
64
+
65
+
66
+ async def search_pages(db: AsyncSession, query: str, limit: int = 200) -> list[dict]:
67
+ """Recherche plein texte dans l'index.
68
+
69
+ Utilise LIKE avec normalisation (pas FTS5 natif) car SQLite FTS5
70
+ necessite une table virtuelle separee qui complique les migrations.
71
+ Cette approche est O(n) sur la table mais bien plus rapide que le
72
+ scan filesystem car les donnees sont deja en memoire SQLite.
73
+ """
74
+ query_norm = _normalize(query.strip())
75
+ if not query_norm:
76
+ return []
77
+
78
+ # Search using normalized LIKE across all text columns
79
+ # We concatenate and normalize in Python for accent-insensitive search
80
+ result = await db.execute(
81
+ text("""
82
+ SELECT page_id, corpus_profile, manuscript_id, folio_label,
83
+ diplomatic_text, translation_fr, tags
84
+ FROM page_search
85
+ """)
86
+ )
87
+ rows = result.fetchall()
88
+
89
+ hits: list[dict] = []
90
+ for row in rows:
91
+ page_id, corpus_profile, manuscript_id, folio_label, diplo, trans, tags = row
92
+
93
+ # Score: count occurrences across all fields
94
+ score = 0
95
+ excerpt = ""
96
+ for field_text in [diplo, trans, tags]:
97
+ if not field_text:
98
+ continue
99
+ normalized = _normalize(field_text)
100
+ count = normalized.count(query_norm)
101
+ if count > 0:
102
+ score += count
103
+ if not excerpt:
104
+ idx = normalized.find(query_norm)
105
+ start = max(0, idx - 60)
106
+ end = min(len(field_text), idx + len(query_norm) + 60)
107
+ ex = field_text[start:end]
108
+ if start > 0:
109
+ ex = "\u2026" + ex
110
+ if end < len(field_text):
111
+ ex = ex + "\u2026"
112
+ excerpt = ex
113
+
114
+ if score > 0:
115
+ hits.append({
116
+ "page_id": page_id,
117
+ "folio_label": folio_label,
118
+ "manuscript_id": manuscript_id,
119
+ "excerpt": excerpt,
120
+ "score": score,
121
+ "corpus_profile": corpus_profile,
122
+ })
123
+
124
+ hits.sort(key=lambda h: h["score"], reverse=True)
125
+ return hits[:limit]
126
+
127
+
128
+ async def reindex_all(db: AsyncSession, data_dir) -> int:
129
+ """Reconstruit l'index complet depuis les fichiers master.json existants."""
130
+ import json
131
+ from pathlib import Path
132
+
133
+ count = 0
134
+ data_path = Path(data_dir)
135
+ for master_path in data_path.glob("corpora/*/pages/*/master.json"):
136
+ try:
137
+ raw = json.loads(master_path.read_text(encoding="utf-8"))
138
+ if not isinstance(raw.get("page_id"), str):
139
+ continue
140
+ master = PageMaster.model_validate(raw)
141
+ await index_page(db, master)
142
+ count += 1
143
+ except Exception as exc:
144
+ logger.warning("Reindexation echouee pour %s: %s", master_path, exc)
145
+ continue
146
+
147
+ await db.commit()
148
+ logger.info("Reindexation terminee", extra={"pages_indexed": count})
149
+ return count
backend/tests/test_api_jobs.py CHANGED
@@ -75,6 +75,22 @@ async def _make_failed_job(db, corpus_id, page_id=None):
75
  return job
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  # ---------------------------------------------------------------------------
79
  # POST /api/v1/corpora/{id}/run
80
  # ---------------------------------------------------------------------------
@@ -183,15 +199,17 @@ async def test_run_page_job_id_is_uuid(async_client, db_session):
183
 
184
 
185
  @pytest.mark.asyncio
186
- async def test_run_page_multiple_times_creates_multiple_jobs(async_client, db_session):
187
- """Lancer run sur la même page deux fois crée deux jobs distincts."""
188
  corpus = await _make_corpus(db_session)
189
  ms = await _make_manuscript(db_session, corpus.id)
190
  page = await _make_page(db_session, ms.id)
191
 
192
- r1 = (await async_client.post(f"/api/v1/pages/{page.id}/run")).json()
193
- r2 = (await async_client.post(f"/api/v1/pages/{page.id}/run")).json()
194
- assert r1["id"] != r2["id"]
 
 
195
 
196
 
197
  # ---------------------------------------------------------------------------
@@ -287,3 +305,62 @@ async def test_retry_failed_job_is_retrievable(async_client, db_session):
287
  await async_client.post(f"/api/v1/jobs/{job.id}/retry")
288
  data = (await async_client.get(f"/api/v1/jobs/{job.id}")).json()
289
  assert data["status"] == "pending"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return job
76
 
77
 
78
async def _make_job(db, corpus_id, page_id=None, status="pending"):
    """Create and persist a job with an arbitrary status.

    Helper for the rate-limiting guard tests: seeds the DB directly with a
    job in any state (pending/claimed/running/done/failed), bypassing the API.
    """
    job = JobModel(
        id=str(uuid.uuid4()),
        corpus_id=corpus_id,
        page_id=page_id,
        status=status,
        # Only failed jobs get a (placeholder) error message.
        error_message="err" if status == "failed" else None,
        created_at=_NOW,
    )
    db.add(job)
    await db.commit()
    await db.refresh(job)
    return job
+
93
+
94
  # ---------------------------------------------------------------------------
95
  # POST /api/v1/corpora/{id}/run
96
  # ---------------------------------------------------------------------------
 
199
 
200
 
201
@pytest.mark.asyncio
async def test_run_page_duplicate_blocked(async_client, db_session):
    """Running the same page twice yields 409 on the second attempt."""
    corpus = await _make_corpus(db_session)
    ms = await _make_manuscript(db_session, corpus.id)
    page = await _make_page(db_session, ms.id)

    # First run is accepted and creates the (still pending) job.
    r1 = await async_client.post(f"/api/v1/pages/{page.id}/run")
    assert r1.status_code == 202

    # Second run hits the active-job guard.
    r2 = await async_client.post(f"/api/v1/pages/{page.id}/run")
    assert r2.status_code == 409
213
 
214
 
215
  # ---------------------------------------------------------------------------
 
305
  await async_client.post(f"/api/v1/jobs/{job.id}/retry")
306
  data = (await async_client.get(f"/api/v1/jobs/{job.id}")).json()
307
  assert data["status"] == "pending"
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Rate-limiting guards — duplicate pipeline runs
312
+ # ---------------------------------------------------------------------------
313
+
314
+ @pytest.mark.asyncio
315
+ async def test_run_corpus_rejects_if_active_jobs(async_client, db_session):
316
+ """409 si le corpus a déjà un job pending/claimed/running."""
317
+ corpus = await _make_corpus(db_session, slug="guard-c1")
318
+ ms = await _make_manuscript(db_session, corpus.id)
319
+ await _make_page(db_session, ms.id)
320
+ # Injecter un job pending directement en base
321
+ await _make_job(db_session, corpus.id, status="pending")
322
+
323
+ response = await async_client.post(f"/api/v1/corpora/{corpus.id}/run")
324
+ assert response.status_code == 409
325
+ assert "déjà en cours" in response.json()["detail"]
326
+
327
+
328
+ @pytest.mark.asyncio
329
+ async def test_run_page_rejects_if_active_job(async_client, db_session):
330
+ """409 si la page a déjà un job running."""
331
+ corpus = await _make_corpus(db_session, slug="guard-p1")
332
+ ms = await _make_manuscript(db_session, corpus.id)
333
+ page = await _make_page(db_session, ms.id)
334
+ # Injecter un job running directement en base
335
+ await _make_job(db_session, corpus.id, page_id=page.id, status="running")
336
+
337
+ response = await async_client.post(f"/api/v1/pages/{page.id}/run")
338
+ assert response.status_code == 409
339
+ assert "déjà en cours" in response.json()["detail"]
340
+
341
+
342
+ @pytest.mark.asyncio
343
+ async def test_run_corpus_allows_after_all_done(async_client, db_session):
344
+ """202 si tous les jobs existants du corpus sont terminés (done)."""
345
+ corpus = await _make_corpus(db_session, slug="guard-c2")
346
+ ms = await _make_manuscript(db_session, corpus.id)
347
+ page = await _make_page(db_session, ms.id)
348
+ # Injecter des jobs terminés
349
+ await _make_job(db_session, corpus.id, page_id=page.id, status="done")
350
+ await _make_job(db_session, corpus.id, page_id=page.id, status="done")
351
+
352
+ response = await async_client.post(f"/api/v1/corpora/{corpus.id}/run")
353
+ assert response.status_code == 202
354
+
355
+
356
+ @pytest.mark.asyncio
357
+ async def test_run_page_allows_after_failed(async_client, db_session):
358
+ """202 si le seul job existant pour la page est failed."""
359
+ corpus = await _make_corpus(db_session, slug="guard-p2")
360
+ ms = await _make_manuscript(db_session, corpus.id)
361
+ page = await _make_page(db_session, ms.id)
362
+ # Injecter un job échoué
363
+ await _make_job(db_session, corpus.id, page_id=page.id, status="failed")
364
+
365
+ response = await async_client.post(f"/api/v1/pages/{page.id}/run")
366
+ assert response.status_code == 202
backend/tests/test_api_search.py CHANGED
@@ -1,60 +1,49 @@
1
  """
2
- Tests de l'endpoint GET /api/v1/search (Sprint 6Session B).
3
 
4
  Stratégie :
5
- - Fichiers master.json réels dans tmp_path
6
- - Override de settings.data_dir pour pointer sur tmp_path
7
  - Vérifie : 422 (paramètre manquant / trop court), résultats vides,
8
  correspondance OCR, insensibilité casse et accents, tri par score,
9
  extrait (excerpt) présent.
10
  """
11
  # 1. stdlib
12
- import json
13
  import uuid
14
- from datetime import datetime, timezone
15
- from pathlib import Path
16
 
17
  # 2. third-party
18
  import pytest
19
 
20
  # 3. local
 
21
  from tests.conftest_api import async_client, db_session # noqa: F401
22
 
23
- _NOW = datetime.now(timezone.utc)
24
-
25
 
26
  # ── Helpers ────────────────────────────────────────────────────────────────────
27
 
28
- def _make_master(page_id: str, diplomatic_text: str = "", translation_fr: str = "") -> dict:
29
- return {
30
- "schema_version": "1.0",
31
- "page_id": page_id,
32
- "corpus_profile": "medieval-illuminated",
33
- "manuscript_id": "ms-test",
34
- "folio_label": "f001r",
35
- "sequence": 1,
36
- "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
37
- "layout": {"regions": []},
38
- "ocr": {
39
- "diplomatic_text": diplomatic_text,
40
- "blocks": [], "lines": [], "language": "la",
41
- "confidence": 0.87, "uncertain_segments": [],
42
- },
43
- "translation": {"fr": translation_fr, "en": ""},
44
- "summary": None,
45
- "commentary": {"public": "", "scholarly": "", "claims": []},
46
- "editorial": {
47
- "status": "machine_draft",
48
- "validated": False, "validated_by": None,
49
- "version": 1, "notes": [],
50
- },
51
- }
52
-
53
-
54
- def _write_master(tmp_path: Path, corpus_slug: str, page_id: str, data: dict) -> None:
55
- page_dir = tmp_path / "corpora" / corpus_slug / "pages" / page_id
56
- page_dir.mkdir(parents=True)
57
- (page_dir / "master.json").write_text(json.dumps(data), encoding="utf-8")
58
 
59
 
60
  # ── Tests ──────────────────────────────────────────────────────────────────────
@@ -74,50 +63,27 @@ async def test_search_q_too_short(async_client):
74
 
75
 
76
  @pytest.mark.asyncio
77
- async def test_search_empty_results(async_client, tmp_path):
78
- """Retourne [] quand aucun master.json ne correspond."""
79
- import app.config as config_mod
80
- original = config_mod.settings.data_dir
81
- config_mod.settings.__dict__["data_dir"] = tmp_path
82
- try:
83
- resp = await async_client.get("/api/v1/search?q=rien")
84
- finally:
85
- config_mod.settings.__dict__["data_dir"] = original
86
-
87
  assert resp.status_code == 200
88
  assert resp.json() == []
89
 
90
 
91
  @pytest.mark.asyncio
92
- async def test_search_returns_list(async_client, tmp_path):
93
  """Le type de retour est toujours une liste."""
94
- import app.config as config_mod
95
- original = config_mod.settings.data_dir
96
- config_mod.settings.__dict__["data_dir"] = tmp_path
97
- try:
98
- resp = await async_client.get("/api/v1/search?q=texte")
99
- finally:
100
- config_mod.settings.__dict__["data_dir"] = original
101
-
102
  assert resp.status_code == 200
103
  assert isinstance(resp.json(), list)
104
 
105
 
106
  @pytest.mark.asyncio
107
- async def test_search_finds_ocr_text(async_client, tmp_path):
108
- """Trouve un master.json dont ocr.diplomatic_text contient la requête."""
109
- import app.config as config_mod
110
-
111
- page_id = str(uuid.uuid4())
112
- _write_master(tmp_path, "corpus-a", page_id, _make_master(page_id, diplomatic_text="Incipit liber primus"))
113
-
114
- original = config_mod.settings.data_dir
115
- config_mod.settings.__dict__["data_dir"] = tmp_path
116
- try:
117
- resp = await async_client.get("/api/v1/search?q=Incipit")
118
- finally:
119
- config_mod.settings.__dict__["data_dir"] = original
120
 
 
121
  assert resp.status_code == 200
122
  results = resp.json()
123
  assert len(results) == 1
@@ -125,20 +91,11 @@ async def test_search_finds_ocr_text(async_client, tmp_path):
125
 
126
 
127
  @pytest.mark.asyncio
128
- async def test_search_case_insensitive(async_client, tmp_path):
129
  """La recherche est insensible à la casse."""
130
- import app.config as config_mod
131
-
132
- page_id = str(uuid.uuid4())
133
- _write_master(tmp_path, "corpus-b", page_id, _make_master(page_id, diplomatic_text="INCIPIT LIBER"))
134
-
135
- original = config_mod.settings.data_dir
136
- config_mod.settings.__dict__["data_dir"] = tmp_path
137
- try:
138
- resp = await async_client.get("/api/v1/search?q=incipit")
139
- finally:
140
- config_mod.settings.__dict__["data_dir"] = original
141
 
 
142
  assert resp.status_code == 200
143
  results = resp.json()
144
  assert len(results) >= 1
@@ -146,20 +103,13 @@ async def test_search_case_insensitive(async_client, tmp_path):
146
 
147
 
148
  @pytest.mark.asyncio
149
- async def test_search_accent_insensitive(async_client, tmp_path):
150
  """La recherche est insensible aux accents."""
151
- import app.config as config_mod
152
-
153
- page_id = str(uuid.uuid4())
154
- _write_master(tmp_path, "corpus-c", page_id, _make_master(page_id, diplomatic_text="Édition française médiévale"))
155
-
156
- original = config_mod.settings.data_dir
157
- config_mod.settings.__dict__["data_dir"] = tmp_path
158
- try:
159
- resp = await async_client.get("/api/v1/search?q=edition")
160
- finally:
161
- config_mod.settings.__dict__["data_dir"] = original
162
 
 
163
  assert resp.status_code == 200
164
  results = resp.json()
165
  assert len(results) >= 1
@@ -167,59 +117,34 @@ async def test_search_accent_insensitive(async_client, tmp_path):
167
 
168
 
169
  @pytest.mark.asyncio
170
- async def test_search_finds_translation_fr(async_client, tmp_path):
171
- """Trouve également dans translation.fr."""
172
- import app.config as config_mod
173
-
174
- page_id = str(uuid.uuid4())
175
- _write_master(tmp_path, "corpus-d", page_id, _make_master(page_id, translation_fr="Ici commence le premier livre"))
176
-
177
- original = config_mod.settings.data_dir
178
- config_mod.settings.__dict__["data_dir"] = tmp_path
179
- try:
180
- resp = await async_client.get("/api/v1/search?q=premier")
181
- finally:
182
- config_mod.settings.__dict__["data_dir"] = original
183
 
 
184
  assert resp.status_code == 200
185
  results = resp.json()
186
  assert any(r["page_id"] == page_id for r in results)
187
 
188
 
189
  @pytest.mark.asyncio
190
- async def test_search_no_match_returns_empty(async_client, tmp_path):
191
  """Ne retourne rien quand la requête ne correspond à aucun texte."""
192
- import app.config as config_mod
193
-
194
- page_id = str(uuid.uuid4())
195
- _write_master(tmp_path, "corpus-e", page_id, _make_master(page_id, diplomatic_text="Incipit liber"))
196
-
197
- original = config_mod.settings.data_dir
198
- config_mod.settings.__dict__["data_dir"] = tmp_path
199
- try:
200
- resp = await async_client.get("/api/v1/search?q=xyznomatch")
201
- finally:
202
- config_mod.settings.__dict__["data_dir"] = original
203
 
 
204
  assert resp.status_code == 200
205
  assert resp.json() == []
206
 
207
 
208
  @pytest.mark.asyncio
209
- async def test_search_result_has_excerpt(async_client, tmp_path):
210
  """Chaque résultat contient un champ excerpt non vide."""
211
- import app.config as config_mod
212
-
213
- page_id = str(uuid.uuid4())
214
- _write_master(tmp_path, "corpus-f", page_id, _make_master(page_id, diplomatic_text="Incipit liber primus"))
215
-
216
- original = config_mod.settings.data_dir
217
- config_mod.settings.__dict__["data_dir"] = tmp_path
218
- try:
219
- resp = await async_client.get("/api/v1/search?q=liber")
220
- finally:
221
- config_mod.settings.__dict__["data_dir"] = original
222
 
 
223
  assert resp.status_code == 200
224
  results = resp.json()
225
  assert len(results) >= 1
@@ -227,27 +152,16 @@ async def test_search_result_has_excerpt(async_client, tmp_path):
227
 
228
 
229
  @pytest.mark.asyncio
230
- async def test_search_sorted_by_score_desc(async_client, tmp_path):
231
  """Les résultats sont triés par score décroissant."""
232
- import app.config as config_mod
233
-
234
- page_id_1 = str(uuid.uuid4())
235
- page_id_2 = str(uuid.uuid4())
236
- # page_id_1 contient 3 occurrences, page_id_2 en contient 1
237
- _write_master(tmp_path, "corpus-g", page_id_1, _make_master(
238
- page_id_1, diplomatic_text="liber liber liber"
239
- ))
240
- _write_master(tmp_path, "corpus-g", page_id_2, _make_master(
241
- page_id_2, diplomatic_text="liber unus"
242
- ))
243
-
244
- original = config_mod.settings.data_dir
245
- config_mod.settings.__dict__["data_dir"] = tmp_path
246
- try:
247
- resp = await async_client.get("/api/v1/search?q=liber")
248
- finally:
249
- config_mod.settings.__dict__["data_dir"] = original
250
-
251
  assert resp.status_code == 200
252
  results = resp.json()
253
  assert len(results) == 2
@@ -256,20 +170,11 @@ async def test_search_sorted_by_score_desc(async_client, tmp_path):
256
 
257
 
258
  @pytest.mark.asyncio
259
- async def test_search_result_fields(async_client, tmp_path):
260
  """Chaque résultat expose les champs attendus."""
261
- import app.config as config_mod
262
-
263
- page_id = str(uuid.uuid4())
264
- _write_master(tmp_path, "corpus-h", page_id, _make_master(page_id, diplomatic_text="Incipit liber"))
265
-
266
- original = config_mod.settings.data_dir
267
- config_mod.settings.__dict__["data_dir"] = tmp_path
268
- try:
269
- resp = await async_client.get("/api/v1/search?q=Incipit")
270
- finally:
271
- config_mod.settings.__dict__["data_dir"] = original
272
 
 
273
  assert resp.status_code == 200
274
  result = resp.json()[0]
275
  assert "page_id" in result
@@ -278,3 +183,15 @@ async def test_search_result_fields(async_client, tmp_path):
278
  assert "excerpt" in result
279
  assert "score" in result
280
  assert "corpus_profile" in result
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Tests de l'endpoint GET /api/v1/search (Sprint 4 — recherche indexée).
3
 
4
  Stratégie :
5
+ - Données indexées directement dans la table page_search (BDD en mémoire)
 
6
  - Vérifie : 422 (paramètre manquant / trop court), résultats vides,
7
  correspondance OCR, insensibilité casse et accents, tri par score,
8
  extrait (excerpt) présent.
9
  """
10
  # 1. stdlib
 
11
  import uuid
 
 
12
 
13
  # 2. third-party
14
  import pytest
15
 
16
  # 3. local
17
+ from app.models.page_search import PageSearchIndex
18
  from tests.conftest_api import async_client, db_session # noqa: F401
19
 
 
 
20
 
21
  # ── Helpers ────────────────────────────────────────────────────────────────────
22
 
23
async def _index_page(
    db,
    page_id: str | None = None,
    diplomatic_text: str = "",
    translation_fr: str = "",
    tags: str = "",
    corpus_profile: str = "medieval-illuminated",
    manuscript_id: str = "ms-test",
    folio_label: str = "f001r",
) -> str:
    """Insert one row into the page_search table and return its page_id."""
    pid = page_id or str(uuid.uuid4())
    row = PageSearchIndex(
        page_id=pid,
        corpus_profile=corpus_profile,
        manuscript_id=manuscript_id,
        folio_label=folio_label,
        diplomatic_text=diplomatic_text,
        translation_fr=translation_fr,
        tags=tags,
    )
    db.add(row)
    await db.commit()
    return pid
 
 
 
 
 
 
47
 
48
 
49
  # ── Tests ──────────────────────────────────────────────────────────────────────
 
63
 
64
 
65
@pytest.mark.asyncio
async def test_search_empty_results(async_client):
    """An unmatched query yields an empty list."""
    resp = await async_client.get("/api/v1/search?q=rien")

    assert resp.status_code == 200
    assert resp.json() == []
71
 
72
 
73
@pytest.mark.asyncio
async def test_search_returns_list(async_client):
    """The endpoint always returns a JSON list, whatever the query."""
    resp = await async_client.get("/api/v1/search?q=texte")

    assert resp.status_code == 200
    assert isinstance(resp.json(), list)
79
 
80
 
81
  @pytest.mark.asyncio
82
+ async def test_search_finds_ocr_text(async_client, db_session):
83
+ """Trouve une page dont diplomatic_text contient la requête."""
84
+ page_id = await _index_page(db_session, diplomatic_text="Incipit liber primus")
 
 
 
 
 
 
 
 
 
 
85
 
86
+ resp = await async_client.get("/api/v1/search?q=Incipit")
87
  assert resp.status_code == 200
88
  results = resp.json()
89
  assert len(results) == 1
 
91
 
92
 
93
  @pytest.mark.asyncio
94
+ async def test_search_case_insensitive(async_client, db_session):
95
  """La recherche est insensible à la casse."""
96
+ page_id = await _index_page(db_session, diplomatic_text="INCIPIT LIBER")
 
 
 
 
 
 
 
 
 
 
97
 
98
+ resp = await async_client.get("/api/v1/search?q=incipit")
99
  assert resp.status_code == 200
100
  results = resp.json()
101
  assert len(results) >= 1
 
103
 
104
 
105
  @pytest.mark.asyncio
106
+ async def test_search_accent_insensitive(async_client, db_session):
107
  """La recherche est insensible aux accents."""
108
+ page_id = await _index_page(
109
+ db_session, diplomatic_text="Édition française médiévale"
110
+ )
 
 
 
 
 
 
 
 
111
 
112
+ resp = await async_client.get("/api/v1/search?q=edition")
113
  assert resp.status_code == 200
114
  results = resp.json()
115
  assert len(results) >= 1
 
117
 
118
 
119
@pytest.mark.asyncio
async def test_search_finds_translation_fr(async_client, db_session):
    """Matches are also found in the translation_fr field."""
    page_id = await _index_page(
        db_session, translation_fr="Ici commence le premier livre"
    )

    resp = await async_client.get("/api/v1/search?q=premier")
    assert resp.status_code == 200
    results = resp.json()
    assert any(r["page_id"] == page_id for r in results)
130
 
131
 
132
@pytest.mark.asyncio
async def test_search_no_match_returns_empty(async_client, db_session):
    """Returns nothing when the query matches no indexed text."""
    await _index_page(db_session, diplomatic_text="Incipit liber")

    resp = await async_client.get("/api/v1/search?q=xyznomatch")
    assert resp.status_code == 200
    assert resp.json() == []
140
 
141
 
142
  @pytest.mark.asyncio
143
+ async def test_search_result_has_excerpt(async_client, db_session):
144
  """Chaque résultat contient un champ excerpt non vide."""
145
+ await _index_page(db_session, diplomatic_text="Incipit liber primus")
 
 
 
 
 
 
 
 
 
 
146
 
147
+ resp = await async_client.get("/api/v1/search?q=liber")
148
  assert resp.status_code == 200
149
  results = resp.json()
150
  assert len(results) >= 1
 
152
 
153
 
154
  @pytest.mark.asyncio
155
+ async def test_search_sorted_by_score_desc(async_client, db_session):
156
  """Les résultats sont triés par score décroissant."""
157
+ page_id_1 = await _index_page(
158
+ db_session, diplomatic_text="liber liber liber"
159
+ )
160
+ page_id_2 = await _index_page(
161
+ db_session, diplomatic_text="liber unus"
162
+ )
163
+
164
+ resp = await async_client.get("/api/v1/search?q=liber")
 
 
 
 
 
 
 
 
 
 
 
165
  assert resp.status_code == 200
166
  results = resp.json()
167
  assert len(results) == 2
 
170
 
171
 
172
  @pytest.mark.asyncio
173
+ async def test_search_result_fields(async_client, db_session):
174
  """Chaque résultat expose les champs attendus."""
175
+ await _index_page(db_session, diplomatic_text="Incipit liber")
 
 
 
 
 
 
 
 
 
 
176
 
177
+ resp = await async_client.get("/api/v1/search?q=Incipit")
178
  assert resp.status_code == 200
179
  result = resp.json()[0]
180
  assert "page_id" in result
 
183
  assert "excerpt" in result
184
  assert "score" in result
185
  assert "corpus_profile" in result
186
+
187
+
188
@pytest.mark.asyncio
async def test_search_finds_tags(async_client, db_session):
    """Iconographic tags are searchable too."""
    page_id = await _index_page(db_session, tags="apocalypse sceau martyrs")

    resp = await async_client.get("/api/v1/search?q=apocalypse")
    assert resp.status_code == 200
    results = resp.json()
    assert len(results) >= 1
    assert any(r["page_id"] == page_id for r in results)
backend/tests/test_search_index.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests pour le service d'indexation et de recherche (page_search + indexer).
3
+ """
4
+ import json
5
+ from pathlib import Path
6
+ from unittest.mock import patch
7
+
8
+ import pytest
9
+ import pytest_asyncio
10
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
11
+
12
+ import app.models  # noqa: F401 — enregistrement des modèles
13
+ from app.models.database import Base
14
+ from app.schemas.page_master import PageMaster
15
+ from app.services.search.indexer import (
16
+ _extract_tags,
17
+ _normalize,
18
+ index_page,
19
+ reindex_all,
20
+ search_pages,
21
+ )
22
+
23
+
24
+ # ── Fixtures ──────────────────────────────────────────────────────────────────
25
+
26
+ _TEST_DB_URL = "sqlite+aiosqlite:///:memory:"
27
+
28
+
29
@pytest_asyncio.fixture
async def db():
    """Yield an AsyncSession bound to a fresh in-memory SQLite database."""
    engine = create_async_engine(_TEST_DB_URL, echo=False)
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    session_factory = async_sessionmaker(engine, expire_on_commit=False)
    async with session_factory() as session:
        yield session

    # Tear down: drop every table, then release the engine's connections.
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
    await engine.dispose()
43
+
44
+
45
def _make_master(
    page_id: str = "test-ms-001r",
    corpus_profile: str = "medieval-illuminated",
    manuscript_id: str = "test-ms",
    folio_label: str = "001r",
    diplomatic_text: str = "Explicit liber primus",
    translation_fr: str = "Fin du premier livre",
    tags: list[str] | None = None,
) -> PageMaster:
    """Build a minimal, schema-valid PageMaster for the tests."""
    # Only attach an iconography extension when tags were requested.
    extensions: dict = (
        {"iconography": [{"region_id": "r1", "tags": tags}]} if tags else {}
    )

    data = {
        "schema_version": "1.0",
        "page_id": page_id,
        "corpus_profile": corpus_profile,
        "manuscript_id": manuscript_id,
        "folio_label": folio_label,
        "sequence": 1,
        "image": {
            "master": "https://example.com/image.jpg",
            "width": 3000,
            "height": 4000,
        },
        "layout": {
            "regions": [
                {
                    "id": "r1",
                    "type": "text_block",
                    "bbox": [100, 100, 500, 500],
                    "confidence": 0.9,
                }
            ]
        },
        "ocr": {
            "diplomatic_text": diplomatic_text,
            "language": "la",
            "confidence": 0.8,
        },
        "translation": {"fr": translation_fr, "en": ""},
        "extensions": extensions,
    }
    return PageMaster.model_validate(data)
90
+
91
+
92
+ # ── Tests _normalize ──────────────────────────────────────────────────────────
93
+
94
+
95
class TestNormalize:
    """Unit tests for the private _normalize() text folder."""

    def test_lowercase(self):
        assert _normalize("HELLO") == "hello"

    def test_accent_removal(self):
        assert _normalize("éàü") == "eau"

    def test_combined(self):
        assert _normalize("Début du Récit") == "debut du recit"

    def test_empty(self):
        assert _normalize("") == ""
107
+
108
+
109
+ # ── Tests _extract_tags ───────────────────────────────────────────────────────
110
+
111
+
112
class TestExtractTags:
    """Unit tests for _extract_tags() over the iconography extension."""

    def test_with_tags(self):
        master = _make_master(tags=["apocalypse", "martyrs", "autel"])
        extracted = _extract_tags(master)
        for tag in ("apocalypse", "martyrs", "autel"):
            assert tag in extracted

    def test_no_tags(self):
        master = _make_master(tags=None)
        assert _extract_tags(master) == ""

    def test_empty_extensions(self):
        # Round-trip through JSON so we can blank out the extensions dict.
        master = _make_master()
        data = master.model_dump(mode="json")
        data["extensions"] = {}
        stripped = PageMaster.model_validate(data)
        assert _extract_tags(stripped) == ""
132
+
133
+
134
+ # ── Tests index_page ─────────────────────────────────────────────────────────
135
+
136
+
137
class TestIndexPage:
    """Tests for index_page(): insert, upsert, and edge cases."""

    @pytest.mark.asyncio
    async def test_index_new_page(self, db: AsyncSession):
        page = _make_master()
        await index_page(db, page)
        await db.commit()

        # Confirm the row landed in page_search.
        from app.models.page_search import PageSearchIndex

        row = await db.get(PageSearchIndex, page.page_id)
        assert row is not None
        assert row.page_id == "test-ms-001r"
        assert row.diplomatic_text == "Explicit liber primus"
        assert row.translation_fr == "Fin du premier livre"
        assert row.manuscript_id == "test-ms"

    @pytest.mark.asyncio
    async def test_index_update_existing(self, db: AsyncSession):
        first = _make_master(diplomatic_text="version 1")
        await index_page(db, first)
        await db.commit()

        # Re-index the same page_id with new content: must overwrite.
        second = _make_master(diplomatic_text="version 2")
        await index_page(db, second)
        await db.commit()

        from app.models.page_search import PageSearchIndex

        row = await db.get(PageSearchIndex, first.page_id)
        assert row is not None
        assert row.diplomatic_text == "version 2"

    @pytest.mark.asyncio
    async def test_index_page_without_ocr(self, db: AsyncSession):
        # A page with neither OCR nor translation indexes as empty strings.
        payload = {
            "schema_version": "1.0",
            "page_id": "no-ocr-page",
            "corpus_profile": "medieval-illuminated",
            "manuscript_id": "test-ms",
            "folio_label": "001r",
            "sequence": 1,
            "image": {
                "master": "https://example.com/image.jpg",
                "width": 3000,
                "height": 4000,
            },
            "layout": {"regions": []},
            "ocr": None,
            "translation": None,
        }
        page = PageMaster.model_validate(payload)
        await index_page(db, page)
        await db.commit()

        from app.models.page_search import PageSearchIndex

        row = await db.get(PageSearchIndex, "no-ocr-page")
        assert row is not None
        assert row.diplomatic_text == ""
        assert row.translation_fr == ""

    @pytest.mark.asyncio
    async def test_index_page_with_tags(self, db: AsyncSession):
        page = _make_master(tags=["sceau", "martyrs"])
        await index_page(db, page)
        await db.commit()

        from app.models.page_search import PageSearchIndex

        row = await db.get(PageSearchIndex, page.page_id)
        assert row is not None
        assert "sceau" in row.tags
        assert "martyrs" in row.tags
212
+
213
+
214
+ # ── Tests search_pages ────────────────────────────────────────────────────────
215
+
216
+
217
class TestSearchPages:
    """Tests for search_pages(): matching, normalization, scoring, limits."""

    @pytest.mark.asyncio
    async def test_search_finds_diplomatic_text(self, db: AsyncSession):
        page = _make_master(diplomatic_text="Explicit liber primus incipit")
        await index_page(db, page)
        await db.commit()

        found = await search_pages(db, "liber")
        assert len(found) == 1
        assert found[0]["page_id"] == "test-ms-001r"
        assert found[0]["score"] >= 1

    @pytest.mark.asyncio
    async def test_search_finds_translation(self, db: AsyncSession):
        page = _make_master(translation_fr="Fin du premier livre")
        await index_page(db, page)
        await db.commit()

        found = await search_pages(db, "premier")
        assert len(found) == 1
        assert found[0]["page_id"] == "test-ms-001r"

    @pytest.mark.asyncio
    async def test_search_finds_tags(self, db: AsyncSession):
        page = _make_master(tags=["apocalypse", "martyrs"])
        await index_page(db, page)
        await db.commit()

        found = await search_pages(db, "apocalypse")
        assert len(found) == 1

    @pytest.mark.asyncio
    async def test_accent_insensitive_search(self, db: AsyncSession):
        page = _make_master(translation_fr="Début du récit apocalyptique")
        await index_page(db, page)
        await db.commit()

        # Query without accents.
        assert len(await search_pages(db, "debut")) == 1
        # Query with accents.
        assert len(await search_pages(db, "début")) == 1
        # Accented word queried unaccented.
        assert len(await search_pages(db, "recit")) == 1

    @pytest.mark.asyncio
    async def test_case_insensitive_search(self, db: AsyncSession):
        page = _make_master(diplomatic_text="Explicit Liber Primus")
        await index_page(db, page)
        await db.commit()

        assert len(await search_pages(db, "EXPLICIT")) == 1
        assert len(await search_pages(db, "explicit")) == 1

    @pytest.mark.asyncio
    async def test_empty_query_returns_nothing(self, db: AsyncSession):
        page = _make_master()
        await index_page(db, page)
        await db.commit()

        assert await search_pages(db, "") == []
        assert await search_pages(db, "   ") == []

    @pytest.mark.asyncio
    async def test_no_match_returns_empty(self, db: AsyncSession):
        page = _make_master(diplomatic_text="Explicit liber primus")
        await index_page(db, page)
        await db.commit()

        assert await search_pages(db, "zzzznonexistent") == []

    @pytest.mark.asyncio
    async def test_results_sorted_by_score(self, db: AsyncSession):
        # One page with many occurrences of the term…
        frequent = _make_master(
            page_id="ms-high",
            folio_label="001r",
            diplomatic_text="liber liber liber liber liber",
        )
        # …and one with a single occurrence.
        rare = _make_master(
            page_id="ms-low",
            folio_label="002r",
            diplomatic_text="liber primus",
        )
        await index_page(db, frequent)
        await index_page(db, rare)
        await db.commit()

        found = await search_pages(db, "liber")
        assert len(found) == 2
        assert found[0]["page_id"] == "ms-high"
        assert found[0]["score"] > found[1]["score"]

    @pytest.mark.asyncio
    async def test_limit_parameter(self, db: AsyncSession):
        # Index 5 pages sharing the same text.
        for i in range(5):
            page = _make_master(
                page_id=f"ms-{i:03d}r",
                folio_label=f"{i:03d}r",
                diplomatic_text="common text shared across all pages",
            )
            await index_page(db, page)
        await db.commit()

        found = await search_pages(db, "common", limit=3)
        assert len(found) == 3

    @pytest.mark.asyncio
    async def test_excerpt_is_populated(self, db: AsyncSession):
        page = _make_master(
            diplomatic_text="Before context Explicit liber primus after context"
        )
        await index_page(db, page)
        await db.commit()

        found = await search_pages(db, "liber")
        assert len(found) == 1
        assert "liber" in found[0]["excerpt"].lower()

    @pytest.mark.asyncio
    async def test_search_across_multiple_fields(self, db: AsyncSession):
        """A page matching in multiple fields should have a higher score."""
        # Matches in both diplomatic text and translation.
        double_hit = _make_master(
            page_id="ms-multi",
            diplomatic_text="liber primus",
            translation_fr="liber premier",
        )
        # Matches in the diplomatic text only.
        single_hit = _make_master(
            page_id="ms-single",
            diplomatic_text="liber primus",
            translation_fr="rien a voir",
        )
        await index_page(db, double_hit)
        await index_page(db, single_hit)
        await db.commit()

        found = await search_pages(db, "liber")
        assert len(found) == 2
        assert found[0]["page_id"] == "ms-multi"
        assert found[0]["score"] > found[1]["score"]
371
+
372
+ # ── Tests reindex_all ─────────────────────────────────────────────────────────
373
+
374
+
375
class TestReindexAll:
    """Tests for reindex_all(): full rebuild from master.json files on disk."""

    @pytest.mark.asyncio
    async def test_reindex_from_filesystem(self, db: AsyncSession, tmp_path: Path):
        """reindex_all should read master.json files and populate the index."""
        # Build a fake corpus directory layout with one page.
        page_dir = tmp_path / "corpora" / "test-ms" / "pages" / "001r"
        page_dir.mkdir(parents=True)

        payload = {
            "schema_version": "1.0",
            "page_id": "test-ms-001r",
            "corpus_profile": "medieval-illuminated",
            "manuscript_id": "test-ms",
            "folio_label": "001r",
            "sequence": 1,
            "image": {
                "master": "https://example.com/image.jpg",
                "width": 3000,
                "height": 4000,
            },
            "layout": {"regions": []},
            "ocr": {
                "diplomatic_text": "Explicit liber primus",
                "language": "la",
                "confidence": 0.8,
            },
            "translation": {"fr": "Fin du premier livre", "en": ""},
        }
        (page_dir / "master.json").write_text(
            json.dumps(payload), encoding="utf-8"
        )

        count = await reindex_all(db, tmp_path)
        assert count == 1

        # The freshly indexed page is now searchable.
        found = await search_pages(db, "liber")
        assert len(found) == 1
        assert found[0]["page_id"] == "test-ms-001r"

    @pytest.mark.asyncio
    async def test_reindex_skips_invalid_files(self, db: AsyncSession, tmp_path: Path):
        """reindex_all should skip invalid master.json files gracefully."""
        page_dir = tmp_path / "corpora" / "test-ms" / "pages" / "bad"
        page_dir.mkdir(parents=True)

        # A file that is not valid JSON must be ignored, not raise.
        (page_dir / "master.json").write_text("not valid json", encoding="utf-8")

        count = await reindex_all(db, tmp_path)
        assert count == 0

    @pytest.mark.asyncio
    async def test_reindex_empty_dir(self, db: AsyncSession, tmp_path: Path):
        """reindex_all on an empty data dir should return 0."""
        assert await reindex_all(db, tmp_path) == 0

    @pytest.mark.asyncio
    async def test_reindex_multiple_pages(self, db: AsyncSession, tmp_path: Path):
        """reindex_all with multiple valid master.json files."""
        for folio in ("001r", "002r", "003r"):
            page_dir = tmp_path / "corpora" / "test-ms" / "pages" / folio
            page_dir.mkdir(parents=True)
            payload = {
                "schema_version": "1.0",
                "page_id": f"test-ms-{folio}",
                "corpus_profile": "medieval-illuminated",
                "manuscript_id": "test-ms",
                "folio_label": folio,
                "sequence": int(folio[:3]),
                "image": {
                    "master": "https://example.com/image.jpg",
                    "width": 3000,
                    "height": 4000,
                },
                "layout": {"regions": []},
                "ocr": {
                    "diplomatic_text": f"Text for folio {folio}",
                    "language": "la",
                    "confidence": 0.8,
                },
            }
            (page_dir / "master.json").write_text(
                json.dumps(payload), encoding="utf-8"
            )

        count = await reindex_all(db, tmp_path)
        assert count == 3