Spaces:
Build error
fix(sprint-f3): conformité spec — IIIF langue, ai_raw.json, prompts génériques
Browse files

Sprint F3 — alignement avec CLAUDE.md et correction des exports :
- IIIF : tag langue "none" → "en" (défaut conforme IIIF Presentation 3.0)
Canvas labels utilisent la langue du manuscrit au lieu de "none"
- ALTO : commentaire clarifié (TextBlock vide déjà correct — pas de data loss)
- METS : warning logger si fichier ALTO référencé n'existe pas encore
- Renommage gemini_raw.json → ai_raw.json (conforme CLAUDE.md §3)
Fonction write_gemini_raw() → write_ai_raw() dans master_writer.py
- Prompts : langue hardcodée ("la"/"fr") → {{primary_language}}
Nouvelle variable primary_language dans le contexte de rendu
- IIIF fetcher : Referer Gallica hardcodé retiré (code générique)
477 tests passants, 0 échecs.
https://claude.ai/code/session_015Lht7wNQRzhUaLw94dE9z9
- backend/app/services/ai/analyzer.py +7 -6
- backend/app/services/ai/master_writer.py +6 -6
- backend/app/services/export/alto.py +2 -2
- backend/app/services/export/iiif.py +2 -2
- backend/app/services/export/mets.py +6 -1
- backend/app/services/ingest/iiif_fetcher.py +0 -1
- backend/tests/test_ai_analyzer.py +21 -21
- backend/tests/test_export_alto.py +1 -1
- backend/tests/test_export_iiif.py +5 -5
- backend/tests/test_export_mets.py +1 -1
- backend/tests/test_image_pipeline.py +0 -1
- prompts/early-modern-print/primary_v1.txt +1 -1
- prompts/medieval-illuminated/primary_v1.txt +1 -1
- prompts/medieval-textual/primary_v1.txt +1 -1
- prompts/modern-handwritten/primary_v1.txt +1 -1
|
@@ -14,7 +14,7 @@ from app.schemas.corpus_profile import CorpusProfile
|
|
| 14 |
from app.schemas.image import ImageDerivativeInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
|
| 17 |
-
from app.services.ai.master_writer import write_gemini_raw, write_master_json
|
| 18 |
from app.services.ai.model_registry import get_provider
|
| 19 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 20 |
from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
|
|
@@ -37,7 +37,7 @@ def run_primary_analysis(
|
|
| 37 |
) -> PageMaster:
|
| 38 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
|
| 39 |
|
| 40 |
-
Respecte R05 : gemini_raw.json est toujours écrit en premier, même en cas
|
| 41 |
d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
|
| 42 |
|
| 43 |
Le provider est sélectionné dynamiquement depuis model_config.provider ;
|
|
@@ -57,7 +57,7 @@ def run_primary_analysis(
|
|
| 57 |
project_root: racine du projet (pour résoudre les chemins des prompts).
|
| 58 |
|
| 59 |
Returns:
|
| 60 |
-
PageMaster validé (gemini_raw.json et master.json écrits sur disque).
|
| 61 |
|
| 62 |
Raises:
|
| 63 |
ParseError: si la réponse IA n'est pas un JSON valide.
|
|
@@ -66,7 +66,7 @@ def run_primary_analysis(
|
|
| 66 |
"""
|
| 67 |
# ── Chemins de sortie ───────────────────────────────────────────────────
|
| 68 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
| 69 |
-
raw_path = page_dir / "gemini_raw.json"
|
| 70 |
master_path = page_dir / "master.json"
|
| 71 |
|
| 72 |
# ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
|
|
@@ -76,6 +76,7 @@ def run_primary_analysis(
|
|
| 76 |
context = {
|
| 77 |
"profile_label": corpus_profile.label,
|
| 78 |
"language_hints": ", ".join(corpus_profile.language_hints),
|
|
|
|
| 79 |
"script_type": corpus_profile.script_type.value,
|
| 80 |
}
|
| 81 |
prompt_text = load_and_render_prompt(prompt_abs_path, context)
|
|
@@ -109,8 +110,8 @@ def run_primary_analysis(
|
|
| 109 |
model_id=model_config.selected_model_id,
|
| 110 |
)
|
| 111 |
|
| 112 |
-
# ── 4. Écriture gemini_raw.json TOUJOURS EN PREMIER (R05) ─────────────
|
| 113 |
-
write_gemini_raw(raw_text, raw_path)
| 114 |
|
| 115 |
# ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
|
| 116 |
layout, ocr = parse_ai_response(raw_text)
|
|
|
|
| 14 |
from app.schemas.image import ImageDerivativeInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
|
| 17 |
+
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
| 18 |
from app.services.ai.model_registry import get_provider
|
| 19 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 20 |
from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
|
|
|
|
| 37 |
) -> PageMaster:
|
| 38 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
|
| 39 |
|
| 40 |
+
Respecte R05 : ai_raw.json est toujours écrit en premier, même en cas
|
| 41 |
d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
|
| 42 |
|
| 43 |
Le provider est sélectionné dynamiquement depuis model_config.provider ;
|
|
|
|
| 57 |
project_root: racine du projet (pour résoudre les chemins des prompts).
|
| 58 |
|
| 59 |
Returns:
|
| 60 |
+
PageMaster validé (ai_raw.json et master.json écrits sur disque).
|
| 61 |
|
| 62 |
Raises:
|
| 63 |
ParseError: si la réponse IA n'est pas un JSON valide.
|
|
|
|
| 66 |
"""
|
| 67 |
# ── Chemins de sortie ───────────────────────────────────────────────────
|
| 68 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
| 69 |
+
raw_path = page_dir / "ai_raw.json"
|
| 70 |
master_path = page_dir / "master.json"
|
| 71 |
|
| 72 |
# ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
|
|
|
|
| 76 |
context = {
|
| 77 |
"profile_label": corpus_profile.label,
|
| 78 |
"language_hints": ", ".join(corpus_profile.language_hints),
|
| 79 |
+
"primary_language": corpus_profile.language_hints[0] if corpus_profile.language_hints else "la",
|
| 80 |
"script_type": corpus_profile.script_type.value,
|
| 81 |
}
|
| 82 |
prompt_text = load_and_render_prompt(prompt_abs_path, context)
|
|
|
|
| 110 |
model_id=model_config.selected_model_id,
|
| 111 |
)
|
| 112 |
|
| 113 |
+
# ── 4. Écriture ai_raw.json TOUJOURS EN PREMIER (R05) ─────────────────
|
| 114 |
+
write_ai_raw(raw_text, raw_path)
|
| 115 |
|
| 116 |
# ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
|
| 117 |
layout, ocr = parse_ai_response(raw_text)
|
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
Écriture des fichiers gemini_raw.json et master.json (R02, R05).
|
| 3 |
|
| 4 |
Règle R05 non négociable :
|
| 5 |
-
1. gemini_raw.json est TOUJOURS écrit en premier.
|
| 6 |
2. master.json n'est écrit QUE si le parsing et la validation Pydantic ont réussi.
|
| 7 |
"""
|
| 8 |
# 1. stdlib
|
|
@@ -16,8 +16,8 @@ from app.schemas.page_master import PageMaster
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
|
| 19 |
-
def write_gemini_raw(raw_text: str, output_path: Path) -> None:
|
| 20 |
-
"""Écrit la réponse brute de l'IA dans gemini_raw.json (R05).
|
| 21 |
|
| 22 |
Toujours appelé AVANT toute tentative de parsing.
|
| 23 |
Le contenu est enveloppé dans un objet JSON pour garantir un fichier valide,
|
|
@@ -31,9 +31,9 @@ def write_gemini_raw(raw_text: str, output_path: Path) -> None:
|
|
| 31 |
encoding="utf-8",
|
| 32 |
)
|
| 33 |
except OSError as exc:
|
| 34 |
-
logger.error("Écriture gemini_raw.json échouée", extra={"path": str(output_path), "error": str(exc)})
|
| 35 |
raise
|
| 36 |
-
logger.info("gemini_raw.json écrit", extra={"path": str(output_path)})
|
| 37 |
|
| 38 |
|
| 39 |
def write_master_json(page_master: PageMaster, output_path: Path) -> None:
|
|
|
|
| 1 |
"""
|
| 2 |
+
Écriture des fichiers ai_raw.json et master.json (R02, R05).
|
| 3 |
|
| 4 |
Règle R05 non négociable :
|
| 5 |
+
1. ai_raw.json est TOUJOURS écrit en premier.
|
| 6 |
2. master.json n'est écrit QUE si le parsing et la validation Pydantic ont réussi.
|
| 7 |
"""
|
| 8 |
# 1. stdlib
|
|
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
|
| 19 |
+
def write_ai_raw(raw_text: str, output_path: Path) -> None:
|
| 20 |
+
"""Écrit la réponse brute de l'IA dans ai_raw.json (R05).
|
| 21 |
|
| 22 |
Toujours appelé AVANT toute tentative de parsing.
|
| 23 |
Le contenu est enveloppé dans un objet JSON pour garantir un fichier valide,
|
|
|
|
| 31 |
encoding="utf-8",
|
| 32 |
)
|
| 33 |
except OSError as exc:
|
| 34 |
+
logger.error("Écriture ai_raw.json échouée", extra={"path": str(output_path), "error": str(exc)})
|
| 35 |
raise
|
| 36 |
+
logger.info("ai_raw.json écrit", extra={"path": str(output_path)})
|
| 37 |
|
| 38 |
|
| 39 |
def write_master_json(page_master: PageMaster, output_path: Path) -> None:
|
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Générateur ALTO v4 depuis un PageMaster validé (R02).
|
| 3 |
|
| 4 |
-
Source canonique : PageMaster uniquement — jamais la réponse brute gemini_raw.json.
|
| 5 |
bbox [x, y, width, height] → HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
|
| 6 |
|
| 7 |
Mapping RegionType → élément ALTO :
|
|
@@ -82,7 +82,7 @@ def _build_text_block(
|
|
| 82 |
text = fallback_text
|
| 83 |
|
| 84 |
if not text:
|
| 85 |
-
return # TextBlock
|
| 86 |
|
| 87 |
x, y, w, h = region.bbox
|
| 88 |
line_el = etree.SubElement(
|
|
|
|
| 1 |
"""
|
| 2 |
Générateur ALTO v4 depuis un PageMaster validé (R02).
|
| 3 |
|
| 4 |
+
Source canonique : PageMaster uniquement — jamais la réponse brute ai_raw.json.
|
| 5 |
bbox [x, y, width, height] → HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
|
| 6 |
|
| 7 |
Mapping RegionType → élément ALTO :
|
|
|
|
| 82 |
text = fallback_text
|
| 83 |
|
| 84 |
if not text:
|
| 85 |
+
return # TextBlock sans TextLine — valide ALTO, région visible dans le layout
|
| 86 |
|
| 87 |
x, y, w, h = region.bbox
|
| 88 |
line_el = etree.SubElement(
|
|
@@ -74,7 +74,7 @@ def generate_manifest(
|
|
| 74 |
|
| 75 |
manuscript_id = manuscript_meta["manuscript_id"]
|
| 76 |
label = manuscript_meta["label"]
|
| 77 |
-
language = manuscript_meta.get("language") or "none"
|
| 78 |
|
| 79 |
# Pages dans l'ordre de séquence (règle absolue — structMap PHYSICAL)
|
| 80 |
pages = sorted(masters, key=lambda m: m.sequence)
|
|
@@ -112,7 +112,7 @@ def generate_manifest(
|
|
| 112 |
canvas: dict = {
|
| 113 |
"id": canvas_id,
|
| 114 |
"type": "Canvas",
|
| 115 |
-
"label": {"none": [f"Folio {page.folio_label}"]},
|
| 116 |
"width": width,
|
| 117 |
"height": height,
|
| 118 |
"items": [
|
|
|
|
| 74 |
|
| 75 |
manuscript_id = manuscript_meta["manuscript_id"]
|
| 76 |
label = manuscript_meta["label"]
|
| 77 |
+
language = manuscript_meta.get("language") or "en"
|
| 78 |
|
| 79 |
# Pages dans l'ordre de séquence (règle absolue — structMap PHYSICAL)
|
| 80 |
pages = sorted(masters, key=lambda m: m.sequence)
|
|
|
|
| 112 |
canvas: dict = {
|
| 113 |
"id": canvas_id,
|
| 114 |
"type": "Canvas",
|
| 115 |
+
"label": {language: [f"Folio {page.folio_label}"]},
|
| 116 |
"width": width,
|
| 117 |
"height": height,
|
| 118 |
"items": [
|
|
@@ -195,8 +195,13 @@ def generate_mets(
|
|
| 195 |
f"{_XL}type": "simple",
|
| 196 |
})
|
| 197 |
|
| 198 |
-
# ALTO
|
| 199 |
alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
|
| 201 |
_el(f_alto, f"{_M}FLocat", {
|
| 202 |
"LOCTYPE": "OTHER",
|
|
|
|
| 195 |
f"{_XL}type": "simple",
|
| 196 |
})
|
| 197 |
|
| 198 |
+
# ALTO (référence conditionnelle — warning si le fichier n'existe pas encore)
|
| 199 |
alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
|
| 200 |
+
if not Path(alto_p).exists():
|
| 201 |
+
logger.warning(
|
| 202 |
+
"Fichier ALTO absent — la référence METS sera cassée tant que l'ALTO n'est pas généré",
|
| 203 |
+
extra={"alto_path": alto_p, "page_id": page.page_id},
|
| 204 |
+
)
|
| 205 |
f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
|
| 206 |
_el(f_alto, f"{_M}FLocat", {
|
| 207 |
"LOCTYPE": "OTHER",
|
|
@@ -17,7 +17,6 @@ _HEADERS = {
|
|
| 17 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 18 |
),
|
| 19 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 20 |
-
"Referer": "https://gallica.bnf.fr/",
|
| 21 |
}
|
| 22 |
|
| 23 |
|
|
|
|
| 17 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 18 |
),
|
| 19 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
|
|
|
| 20 |
}
|
| 21 |
|
| 22 |
|
|
@@ -3,7 +3,7 @@ Tests du pipeline d'analyse IA :
|
|
| 3 |
- prompt_loader : chargement + rendu des templates
|
| 4 |
- client_factory : construction du genai.Client selon le provider
|
| 5 |
- response_parser: parsing JSON brut → layout + OCRResult
|
| 6 |
-
- master_writer : écriture gemini_raw.json et master.json
|
| 7 |
- analyzer : run_primary_analysis (end-to-end mocké)
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
@@ -31,7 +31,7 @@ from app.schemas.model_config import ModelConfig, ProviderType
|
|
| 31 |
from app.schemas.page_master import OCRResult, PageMaster
|
| 32 |
from app.services.ai.analyzer import run_primary_analysis
|
| 33 |
from app.services.ai.client_factory import build_client
|
| 34 |
-
from app.services.ai.master_writer import write_gemini_raw, write_master_json
|
| 35 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 36 |
from app.services.ai.response_parser import ParseError, parse_ai_response
|
| 37 |
|
|
@@ -390,35 +390,35 @@ def test_parse_empty_regions_list():
|
|
| 390 |
|
| 391 |
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
-
# Tests —
|
| 394 |
# ---------------------------------------------------------------------------
|
| 395 |
|
| 396 |
-
def
|
| 397 |
-
out = tmp_path / "page" / "
|
| 398 |
-
|
| 399 |
|
| 400 |
assert out.exists()
|
| 401 |
|
| 402 |
|
| 403 |
-
def
|
| 404 |
-
out = tmp_path / "
|
| 405 |
-
|
| 406 |
|
| 407 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 408 |
assert "response_text" in content
|
| 409 |
assert content["response_text"] == '{"not": "valid json from AI"}'
|
| 410 |
|
| 411 |
|
| 412 |
-
def
|
| 413 |
-
out = tmp_path / "deep" / "nested" / "dir" / "
|
| 414 |
-
|
| 415 |
assert out.exists()
|
| 416 |
|
| 417 |
|
| 418 |
-
def
|
| 419 |
-
"""Même si le texte brut est invalide,
|
| 420 |
-
out = tmp_path / "
|
| 421 |
-
|
| 422 |
|
| 423 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 424 |
assert content["response_text"] == "this is not json at all"
|
|
@@ -444,7 +444,7 @@ def _make_page_master() -> PageMaster:
|
|
| 444 |
"model_id": "gemini-2.0-flash",
|
| 445 |
"model_display_name": "Gemini 2.0 Flash",
|
| 446 |
"prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
|
| 447 |
-
"raw_response_path": "/data/gemini_raw.json",
|
| 448 |
"processed_at": datetime.now(tz=timezone.utc),
|
| 449 |
},
|
| 450 |
)
|
|
@@ -569,12 +569,12 @@ def test_run_primary_analysis_files_created(tmp_path):
|
|
| 569 |
)
|
| 570 |
|
| 571 |
page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
|
| 572 |
-
assert (page_dir / "gemini_raw.json").exists()
|
| 573 |
assert (page_dir / "master.json").exists()
|
| 574 |
|
| 575 |
|
| 576 |
def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
| 577 |
-
"""gemini_raw.json est écrit AVANT que le parsing échoue (R05)."""
|
| 578 |
prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
|
| 579 |
_setup_prompt_file(tmp_path, prompt_rel)
|
| 580 |
deriv_path = _setup_derivative(tmp_path)
|
|
@@ -597,8 +597,8 @@ def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
|
| 597 |
project_root=tmp_path,
|
| 598 |
)
|
| 599 |
|
| 600 |
-
# gemini_raw.json existe malgré l'échec de parsing
|
| 601 |
-
raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "gemini_raw.json"
|
| 602 |
assert raw_path.exists()
|
| 603 |
|
| 604 |
# master.json N'existe PAS (parsing a échoué)
|
|
|
|
| 3 |
- prompt_loader : chargement + rendu des templates
|
| 4 |
- client_factory : construction du genai.Client selon le provider
|
| 5 |
- response_parser: parsing JSON brut → layout + OCRResult
|
| 6 |
+
- master_writer : écriture ai_raw.json et master.json
|
| 7 |
- analyzer : run_primary_analysis (end-to-end mocké)
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
|
|
| 31 |
from app.schemas.page_master import OCRResult, PageMaster
|
| 32 |
from app.services.ai.analyzer import run_primary_analysis
|
| 33 |
from app.services.ai.client_factory import build_client
|
| 34 |
+
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
| 35 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 36 |
from app.services.ai.response_parser import ParseError, parse_ai_response
|
| 37 |
|
|
|
|
| 390 |
|
| 391 |
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
+
# Tests — write_ai_raw / write_master_json
|
| 394 |
# ---------------------------------------------------------------------------
|
| 395 |
|
| 396 |
+
def test_write_ai_raw_creates_file(tmp_path):
|
| 397 |
+
out = tmp_path / "page" / "ai_raw.json"
|
| 398 |
+
write_ai_raw("raw AI text here", out)
|
| 399 |
|
| 400 |
assert out.exists()
|
| 401 |
|
| 402 |
|
| 403 |
+
def test_write_ai_raw_valid_json(tmp_path):
|
| 404 |
+
out = tmp_path / "ai_raw.json"
|
| 405 |
+
write_ai_raw('{"not": "valid json from AI"}', out)
|
| 406 |
|
| 407 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 408 |
assert "response_text" in content
|
| 409 |
assert content["response_text"] == '{"not": "valid json from AI"}'
|
| 410 |
|
| 411 |
|
| 412 |
+
def test_write_ai_raw_creates_parent_dirs(tmp_path):
|
| 413 |
+
out = tmp_path / "deep" / "nested" / "dir" / "ai_raw.json"
|
| 414 |
+
write_ai_raw("text", out)
|
| 415 |
assert out.exists()
|
| 416 |
|
| 417 |
|
| 418 |
+
def test_write_ai_raw_with_non_json_text(tmp_path):
|
| 419 |
+
"""Même si le texte brut est invalide, ai_raw.json est créé."""
|
| 420 |
+
out = tmp_path / "ai_raw.json"
|
| 421 |
+
write_ai_raw("this is not json at all", out)
|
| 422 |
|
| 423 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 424 |
assert content["response_text"] == "this is not json at all"
|
|
|
|
| 444 |
"model_id": "gemini-2.0-flash",
|
| 445 |
"model_display_name": "Gemini 2.0 Flash",
|
| 446 |
"prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
|
| 447 |
+
"raw_response_path": "/data/ai_raw.json",
|
| 448 |
"processed_at": datetime.now(tz=timezone.utc),
|
| 449 |
},
|
| 450 |
)
|
|
|
|
| 569 |
)
|
| 570 |
|
| 571 |
page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
|
| 572 |
+
assert (page_dir / "ai_raw.json").exists()
|
| 573 |
assert (page_dir / "master.json").exists()
|
| 574 |
|
| 575 |
|
| 576 |
def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
| 577 |
+
"""ai_raw.json est écrit AVANT que le parsing échoue (R05)."""
|
| 578 |
prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
|
| 579 |
_setup_prompt_file(tmp_path, prompt_rel)
|
| 580 |
deriv_path = _setup_derivative(tmp_path)
|
|
|
|
| 597 |
project_root=tmp_path,
|
| 598 |
)
|
| 599 |
|
| 600 |
+
# ai_raw.json existe malgré l'échec de parsing
|
| 601 |
+
raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "ai_raw.json"
|
| 602 |
assert raw_path.exists()
|
| 603 |
|
| 604 |
# master.json N'existe PAS (parsing a échoué)
|
|
@@ -56,7 +56,7 @@ def _make_master(
|
|
| 56 |
model_id="gemini-2.0-flash",
|
| 57 |
model_display_name="Gemini 2.0 Flash",
|
| 58 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 59 |
-
raw_response_path="/data/gemini_raw.json",
|
| 60 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 61 |
)
|
| 62 |
return PageMaster(
|
|
|
|
| 56 |
model_id="gemini-2.0-flash",
|
| 57 |
model_display_name="Gemini 2.0 Flash",
|
| 58 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 59 |
+
raw_response_path="/data/ai_raw.json",
|
| 60 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 61 |
)
|
| 62 |
return PageMaster(
|
|
@@ -193,11 +193,11 @@ def test_manifest_label_uses_language_key(simple_manifest):
|
|
| 193 |
|
| 194 |
|
| 195 |
def test_manifest_label_without_language_uses_none():
|
| 196 |
-
"""Sans champ language, la clé de label est 'none'."""
|
| 197 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 198 |
-
meta = _base_meta() # pas de language
|
| 199 |
manifest = generate_manifest(pages, meta, _BASE_URL)
|
| 200 |
-
assert "none" in manifest["label"]
|
| 201 |
|
| 202 |
|
| 203 |
def test_manifest_label_fr(chroniques_pages, chroniques_meta):
|
|
@@ -272,7 +272,7 @@ def test_canvas_order_respects_sequence():
|
|
| 272 |
_make_page("ms-f002r", "f002r", 2),
|
| 273 |
]
|
| 274 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 275 |
-
labels = [c["label"]["none"][0] for c in manifest["items"]]
|
| 276 |
assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
|
| 277 |
|
| 278 |
|
|
@@ -283,7 +283,7 @@ def test_canvas_order_large_sequence():
|
|
| 283 |
random.shuffle(pages)
|
| 284 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 285 |
sequences_in_label = [
|
| 286 |
-
int(c["label"]["none"][0].replace("Folio f", "").replace("r", ""))
|
| 287 |
for c in manifest["items"]
|
| 288 |
]
|
| 289 |
assert sequences_in_label == list(range(1, 11))
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def test_manifest_label_without_language_uses_none():
|
| 196 |
+
"""Sans champ language, la clé de label est 'en' (défaut IIIF-compliant)."""
|
| 197 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 198 |
+
meta = _base_meta() # pas de language → défaut "en"
|
| 199 |
manifest = generate_manifest(pages, meta, _BASE_URL)
|
| 200 |
+
assert "en" in manifest["label"]
|
| 201 |
|
| 202 |
|
| 203 |
def test_manifest_label_fr(chroniques_pages, chroniques_meta):
|
|
|
|
| 272 |
_make_page("ms-f002r", "f002r", 2),
|
| 273 |
]
|
| 274 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 275 |
+
labels = [c["label"]["en"][0] for c in manifest["items"]]
|
| 276 |
assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
|
| 277 |
|
| 278 |
|
|
|
|
| 283 |
random.shuffle(pages)
|
| 284 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 285 |
sequences_in_label = [
|
| 286 |
+
int(c["label"]["en"][0].replace("Folio f", "").replace("r", ""))
|
| 287 |
for c in manifest["items"]
|
| 288 |
]
|
| 289 |
assert sequences_in_label == list(range(1, 11))
|
|
@@ -70,7 +70,7 @@ def _make_page(
|
|
| 70 |
model_id="gemini-2.0-flash",
|
| 71 |
model_display_name="Gemini 2.0 Flash",
|
| 72 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 73 |
-
raw_response_path=f"/data/corpora/test/pages/{folio_label}/gemini_raw.json",
|
| 74 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 75 |
)
|
| 76 |
ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
|
|
|
|
| 70 |
model_id="gemini-2.0-flash",
|
| 71 |
model_display_name="Gemini 2.0 Flash",
|
| 72 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 73 |
+
raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
|
| 74 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 75 |
)
|
| 76 |
ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
|
|
@@ -278,7 +278,6 @@ def test_fetch_iiif_image_success():
|
|
| 278 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 279 |
),
|
| 280 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 281 |
-
"Referer": "https://gallica.bnf.fr/",
|
| 282 |
},
|
| 283 |
follow_redirects=True,
|
| 284 |
timeout=60.0,
|
|
|
|
| 278 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 279 |
),
|
| 280 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
|
|
|
| 281 |
},
|
| 282 |
follow_redirects=True,
|
| 283 |
timeout=60.0,
|
|
@@ -28,7 +28,7 @@ Format de sortie JSON attendu :
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
-
"language": "
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
+
"language": "{{primary_language}}",
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
@@ -28,7 +28,7 @@ Format de sortie JSON attendu :
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
-
"language": "
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
+
"language": "{{primary_language}}",
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
@@ -28,7 +28,7 @@ Format de sortie JSON attendu :
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
-
"language": "
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
+
"language": "{{primary_language}}",
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
@@ -28,7 +28,7 @@ Format de sortie JSON attendu :
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
-
"language": "
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|
|
|
|
| 28 |
"diplomatic_text": "",
|
| 29 |
"blocks": [],
|
| 30 |
"lines": [],
|
| 31 |
+
"language": "{{primary_language}}",
|
| 32 |
"confidence": 0.0,
|
| 33 |
"uncertain_segments": []
|
| 34 |
}
|